<a href="https://colab.research.google.com/github/Precious3ita/Top-spotify-listening/blob/main/Week7_NonLinear_Modeling_TMDb_Telco.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 7 Assessment — Non‑Linear Modeling
**Datasets:** TMDB (regression — predict log revenue) and Telco Customer Churn (classification).

This Colab notebook runs EDA, baseline linear models, non-linear models (Decision Tree, Random Forest, XGBoost, CatBoost), 5‑fold CV, hyperparameter tuning, and produces a results CSV ready for download.


## 1) Libraries needed

In [5]:
# Install required packages (may take a minute)
!pip install -q xgboost catboost scikit-learn pandas matplotlib seaborn

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, classification_report
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
print('Imports done')

Imports done


## 2) Upload datasets
Run and upload the two CSV files when prompted:
- `tmdb_5000_movies.csv` (TMDB dataset; https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata)
- `telco_customer_churn.csv` (Telco churn; https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

## 3) TMDB: Prepare regression dataset (predict log revenue)

In [6]:
tmdb = pd.read_csv('tmdb_5000_movies.csv')
tmdb.shape


(4803, 20)

In [7]:
# Basic feature engineering for TMDB
import ast
tmdb['release_date'] = pd.to_datetime(tmdb['release_date'], errors='coerce')
tmdb['release_year'] = tmdb['release_date'].dt.year.fillna(0).astype(int)
def top_genre(genres_str):
    try:
        g = ast.literal_eval(genres_str)
        if isinstance(g, list) and len(g)>0:
            return g[0].get('name', 'Unknown')
    except:
        return 'Unknown'
    return 'Unknown'
tmdb['top_genre'] = tmdb['genres'].apply(top_genre)
tmdb = tmdb[(tmdb['revenue']>0) & (tmdb['budget']>0)]
tmdb['log_revenue'] = np.log1p(tmdb['revenue'])
tmdb[['budget','popularity','vote_average','vote_count','runtime','release_year','top_genre','log_revenue']].head()


Unnamed: 0,budget,popularity,vote_average,vote_count,runtime,release_year,top_genre,log_revenue
0,237000000,150.437577,7.2,11800,162.0,2009,Action,21.748578
1,300000000,139.082615,6.9,4500,169.0,2007,Adventure,20.683485
2,245000000,107.376788,6.3,4466,148.0,2015,Action,20.596199
3,250000000,112.31295,7.6,9106,165.0,2012,Action,20.80479
4,260000000,43.926995,6.1,2124,132.0,2012,Action,19.464974


In [8]:
# Select features and split
features_reg = ['budget','popularity','vote_average','vote_count','runtime','release_year','top_genre']
X_reg = tmdb[features_reg].copy()
y_reg = tmdb['log_revenue'].copy()

numeric_reg = ['budget','popularity','vote_average','vote_count','runtime','release_year']
cat_reg = ['top_genre']
preproc_reg = ColumnTransformer([
    ('num', StandardScaler(), numeric_reg),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_reg)
])
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
print('TMDB train/test sizes:', X_train_reg.shape, X_test_reg.shape)


TMDB train/test sizes: (2583, 7) (646, 7)


## 4) Telco: Prepare classification dataset (predict churn)

In [9]:
telco = pd.read_csv('/content/telco-customer-churn.csv')
telco.shape


(7043, 21)

In [10]:
telco.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
# Quick cleaning for common Telco format
if 'Churn' in telco.columns:
    telco['Churn'] = telco['Churn'].map({'Yes':1,'No':0})
else:
    # try common alternatives
    possible = [c for c in telco.columns if 'churn' in c.lower()]
    if possible:
        telco['Churn'] = telco[possible[0]]

# Drop customerID if exists
for c in ['customerID','customer_id','id']:
    if c in telco.columns:
        telco = telco.drop(columns=[c])

# Fill simple numeric conversion
for col in telco.select_dtypes(include=['object']).columns:
    if telco[col].nunique()<=10:
        telco[col] = telco[col].fillna('Unknown')
    else:
        telco[col] = telco[col].replace(' ', np.nan)

# Drop rows with missing target
telco = telco[telco['Churn'].notna()]
telco.shape


(7043, 20)

In [12]:
# Prepare features: simple approach
# Use numeric cols and a few categorical columns
numeric_tel = telco.select_dtypes(include=[np.number]).columns.tolist()
numeric_tel = [c for c in numeric_tel if c!='Churn']
cat_tel = [c for c in telco.select_dtypes(include=['object']).columns if c!='Churn']
features_clf = numeric_tel + cat_tel
X_clf = telco[features_clf].copy()
y_clf = telco['Churn'].astype(int).copy()

from sklearn.impute import SimpleImputer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preproc_clf = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_tel),
        ('cat', categorical_transformer, cat_tel)
    ])

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
print('Telco train/test sizes:', X_train_clf.shape, X_test_clf.shape)

Telco train/test sizes: (5634, 19) (1409, 19)


## 5) Baseline Linear Models (for comparison)

In [13]:
results = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Linear Regression on TMDB
lr_pipe = Pipeline([('pre', preproc_reg), ('model', LinearRegression())])
cv_scores = -cross_val_score(lr_pipe, X_reg, y_reg, cv=kf, scoring='neg_root_mean_squared_error')
lr_pipe.fit(X_train_reg, y_train_reg)
preds = lr_pipe.predict(X_test_reg)
rmse_test = np.sqrt(mean_squared_error(y_test_reg, preds))  # fixed here
results.append({'Task':'Regression','Model':'LinearRegression','CV_Score':cv_scores.mean(),'Test_Score':rmse_test})
print('LinearRegression RMSE (cv mean):', cv_scores.mean(), 'test RMSE:', rmse_test)

# Logistic Regression on Telco
log_pipe = Pipeline([('pre', preproc_clf), ('model', LogisticRegression(max_iter=500))])
cv_acc = cross_val_score(log_pipe, X_clf, y_clf, cv=kf, scoring='accuracy')
log_pipe.fit(X_train_clf, y_train_clf)
preds = log_pipe.predict(X_test_clf)
acc_test = accuracy_score(y_test_clf, preds)
results.append({'Task':'Classification','Model':'LogisticRegression','CV_Score':cv_acc.mean(),'Test_Score':acc_test})
print('LogisticRegression Accuracy (cv mean):', cv_acc.mean(), 'test acc:', acc_test)


LinearRegression RMSE (cv mean): 1.7032344527276013 test RMSE: 1.6071981353011466
LogisticRegression Accuracy (cv mean): 0.8057629726111362 test acc: 0.8239886444286728


## 6) Non-Linear Models + CV (DecisionTree, RandomForest, XGBoost, CatBoost)

In [14]:
from sklearn.base import clone

# Non-linear Regressors
nonlinear_regressors = {
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42, verbosity=0),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

for name, model in nonlinear_regressors.items():
    pipe = Pipeline([('pre', preproc_reg), ('model', model)])
    scores = -cross_val_score(pipe, X_reg, y_reg, cv=kf, scoring='neg_root_mean_squared_error')
    pipe.fit(X_train_reg, y_train_reg)
    preds = pipe.predict(X_test_reg)
    rmse_test = np.sqrt(mean_squared_error(y_test_reg, preds))
    results.append({'Task':'Regression','Model':name,'CV_Score':scores.mean(),'Test_Score':rmse_test})
    print(f"{name} RMSE (cv mean): {scores.mean():.4f}  test RMSE: {rmse_test:.4f}")


# Non-linear Classifiers
nonlinear_classifiers = {
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42)
}

for name, model in nonlinear_classifiers.items():
    pipe = Pipeline([('pre', preproc_clf), ('model', model)])
    scores = cross_val_score(pipe, X_clf, y_clf, cv=kf, scoring='accuracy')
    pipe.fit(X_train_clf, y_train_clf)
    preds = pipe.predict(X_test_clf)
    acc_test = accuracy_score(y_test_clf, preds)
    results.append({'Task':'Classification','Model':name,'CV_Score':scores.mean(),'Test_Score':acc_test})
    print(f"{name} Accuracy (cv mean): {scores.mean():.4f}  test acc: {acc_test:.4f}")



DecisionTree RMSE (cv mean): 1.7416  test RMSE: 1.7827
RandomForest RMSE (cv mean): 1.2711  test RMSE: 1.2808
XGBoost RMSE (cv mean): 1.3638  test RMSE: 1.3403
CatBoost RMSE (cv mean): 1.2226  test RMSE: 1.2650
DecisionTree Accuracy (cv mean): 0.7700  test acc: 0.7700
RandomForest Accuracy (cv mean): 0.7910  test acc: 0.7970
XGBoost Accuracy (cv mean): 0.7809  test acc: 0.7864
CatBoost Accuracy (cv mean): 0.8069  test acc: 0.8119


## 7) Hyperparameter Tuning (examples)
We'll tune RandomForest and XGBoost for both tasks with a small grid to keep runtime reasonable.

In [15]:
# Tune RandomForest Regressor
param_grid_rf_reg = {'model__n_estimators':[50,100], 'model__max_depth':[None,10,20]}
pipe_rf_reg = Pipeline([('pre', preproc_reg), ('model', RandomForestRegressor(random_state=42))])
grid_rf_reg = GridSearchCV(pipe_rf_reg, param_grid_rf_reg, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_rf_reg.fit(X_train_reg, y_train_reg)
best_rf_reg = grid_rf_reg.best_estimator_
preds = best_rf_reg.predict(X_test_reg)
rmse_best = np.sqrt(mean_squared_error(y_test_reg, preds))
results.append({'Task':'Regression','Model':'RandomForest (Tuned)','CV_Score':-grid_rf_reg.best_score_,'Test_Score':rmse_best})
print('RF reg best params:', grid_rf_reg.best_params_, 'test RMSE:', rmse_best)

# Tune XGBoost Classifier
param_grid_xgb_clf = {'model__n_estimators':[50,100], 'model__max_depth':[3,6], 'model__learning_rate':[0.1,0.01]}
pipe_xgb_clf = Pipeline([('pre', preproc_clf), ('model', xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))])
grid_xgb_clf = GridSearchCV(pipe_xgb_clf, param_grid_xgb_clf, cv=3, scoring='accuracy', n_jobs=-1)
grid_xgb_clf.fit(X_train_clf, y_train_clf)
best_xgb_clf = grid_xgb_clf.best_estimator_
preds = best_xgb_clf.predict(X_test_clf)
acc_best = accuracy_score(y_test_clf, preds)
results.append({'Task':'Classification','Model':'XGBoost (Tuned)','CV_Score':grid_xgb_clf.best_score_,'Test_Score':acc_best})
print('XGB clf best params:', grid_xgb_clf.best_params_, 'test acc:', acc_best)


RF reg best params: {'model__max_depth': 10, 'model__n_estimators': 100} test RMSE: 1.2783683002831336
XGB clf best params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50} test acc: 0.8119233498935415


## 8) Results table & Save
We compile all results into a DataFrame and save `week7_results.csv`. The notebook will offer the CSV for download.

In [16]:
results_df = pd.DataFrame(results)
results_df = results_df[['Task','Model','CV_Score','Test_Score']]
results_df

results_df.to_csv('week7_results.csv', index=False)
from google.colab import files
files.download('week7_results.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>