In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [15]:
# Read churn_clean.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="churn_clean.csv")

In [16]:
# Columns kept for the analysis, in the order they will appear in the frame.
columns = [
    # descriptive (string-valued) fields
    "State",
    "Area",
    "Marital",
    "Gender",
    "InternetService",
    "Contract",
    # numeric fields
    "Age",
    "Income",
    "Children",
    "Outage_sec_perweek",
    "Email",
    "Contacts",
    "MonthlyCharge",
    "Bandwidth_GB_Year",
    "Tenure",
]

In [17]:
# Names of the categorical columns (the ones later passed to one-hot encoding).
cat_var = [
    "State",
    "Area",
    "Marital",
    "Gender",
    "InternetService",
    "Contract",
]

In [18]:
# Keep every row but only the selected columns, in the order given by `columns`.
updated_data = data.loc[:, columns]

In [19]:
# Count rows that exactly repeat an earlier row across all selected columns.
updated_data.duplicated(keep="first").sum()

0

In [20]:
# Number of missing entries in each column.
updated_data.isnull().sum()

State                    0
Area                     0
Marital                  0
Gender                   0
InternetService       2129
Contract                 0
Age                      0
Income                   0
Children                 0
Outage_sec_perweek       0
Email                    0
Contacts                 0
MonthlyCharge            0
Bandwidth_GB_Year        0
Tenure                   0
dtype: int64

In [21]:
# InternetService is the only column with missing entries (2129 in the check
# above), so fill that column alone. Filling the whole frame with the string
# "None" would silently turn any future gap in a numeric column into text and
# change that column's dtype.
# NOTE(review): presumably the raw file stores the literal "None" for customers
# without internet service and read_csv parsed it as missing — confirm.
updated_data = updated_data.fillna({"InternetService": "None"})
# Re-check: every column should now report zero missing entries.
updated_data.isna().sum()

State                 0
Area                  0
Marital               0
Gender                0
InternetService       0
Contract              0
Age                   0
Income                0
Children              0
Outage_sec_perweek    0
Email                 0
Contacts              0
MonthlyCharge         0
Bandwidth_GB_Year     0
Tenure                0
dtype: int64

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
updated_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age,Income,Children,Outage_sec_perweek,Email,Contacts,MonthlyCharge,Bandwidth_GB_Year,Tenure
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,53.0784,39806.926771,2.0877,10.001848,12.016,0.9942,172.624816,3392.34155,34.526188
std,20.698882,28199.916702,2.1472,2.976019,3.025898,0.988466,42.943094,2185.294852,26.443063
min,18.0,348.67,0.0,0.099747,1.0,0.0,79.97886,155.506715,1.000259
25%,35.0,19224.7175,0.0,8.018214,10.0,0.0,139.979239,1236.470827,7.917694
50%,53.0,33170.605,1.0,10.01856,12.0,1.0,167.4847,3279.536903,35.430507
75%,71.0,53246.17,3.0,11.969485,14.0,2.0,200.734725,5586.14137,61.479795
max,89.0,258900.7,10.0,21.20723,23.0,7.0,290.160419,7158.98153,71.99928


In [23]:
# Summary of the categorical columns: count, distinct values, most common value and its frequency.
updated_data.loc[:, cat_var].describe()

Unnamed: 0,State,Area,Marital,Gender,InternetService,Contract
count,10000,10000,10000,10000,10000,10000
unique,52,3,5,3,3,3
top,TX,Suburban,Divorced,Female,Fiber Optic,Month-to-month
freq,603,3346,2092,5025,4408,5456


In [24]:
# One-hot encode each categorical column into 0/1 integer indicator columns;
# the remaining columns are carried over as they are.
# NOTE: this rebinds `updated_data`, so running the cell a second time without
# re-running the earlier cells is expected to fail (the columns in `cat_var`
# no longer exist after encoding).
updated_data = pd.get_dummies(data=updated_data, columns=cat_var, dtype=int)
# Display the encoded frame.
updated_data

Unnamed: 0,Age,Income,Children,Outage_sec_perweek,Email,Contacts,MonthlyCharge,Bandwidth_GB_Year,Tenure,State_AK,...,Marital_Widowed,Gender_Female,Gender_Male,Gender_Nonbinary,InternetService_DSL,InternetService_Fiber Optic,InternetService_None,Contract_Month-to-month,Contract_One year,Contract_Two Year
0,68,28561.99,0,7.978323,10,0,172.455519,904.536110,6.795513,1,...,1,0,1,0,0,1,0,0,1,0
1,27,21704.77,1,11.699080,12,0,242.632554,800.982766,1.156681,0,...,0,1,0,0,0,1,0,1,0,0
2,50,9609.57,4,10.752800,9,0,159.947583,2054.706961,15.754144,0,...,1,1,0,0,1,0,0,0,0,1
3,48,18925.23,1,14.913540,15,2,119.956840,2164.579412,17.087227,0,...,0,0,1,0,1,0,0,0,0,1
4,83,40074.19,0,8.147417,16,2,149.948316,271.493436,1.670972,0,...,0,0,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,23,55723.74,3,9.415935,12,2,159.979400,6511.252601,68.197130,0,...,0,0,1,0,1,0,0,1,0,0
9996,48,34129.34,4,6.740547,15,2,207.481100,5695.951810,61.040370,0,...,0,0,1,0,0,1,0,0,0,1
9997,48,45983.43,1,6.590911,10,0,169.974100,4159.305799,47.416890,0,...,0,1,0,0,0,1,0,1,0,0
9998,39,16667.58,1,12.071910,14,1,252.624000,6468.456752,71.095600,0,...,0,0,1,0,0,1,0,0,0,1


In [25]:
# Save the encoded dataset to CSV; the row index is written along with the data.
updated_data.to_csv("updated_data.csv", index=True)

In [26]:
# Tenure is the regression target; every other column is used as a predictor.
# NOTE(review): Bandwidth_GB_Year remains among the predictors — check whether
# it is a near-proxy for Tenure, given the test R-squared of about 0.998
# reported at the end of the notebook.
X = updated_data.drop(columns=["Tenure"])
y = updated_data["Tenure"]

In [27]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [28]:
# Save the four train/test pieces to CSV. The row index is written too, so each
# saved row can be matched back to its position in the full dataset.
X_train.to_csv("X_train.csv", index=True)
X_test.to_csv("X_test.csv", index=True)
y_train.to_csv("y_train.csv", index=True)
y_test.to_csv("y_test.csv", index=True)

In [29]:
# Fit a random forest regressor (otherwise default hyperparameters) on the
# training split. random_state is pinned, using the same seed as the
# train/test split, so that re-running the notebook rebuilds the same forest
# and reproduces the reported metrics instead of drifting from run to run.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

In [30]:
# Predict Tenure for both splits: the training predictions are used to gauge
# fit on data the model has seen, the test predictions on held-out data.
y_pred_train = rf.predict(X=X_train)
y_pred = rf.predict(X=X_test)

In [31]:
# Evaluation metrics for both splits.
# Suffix 1 = held-out test set, suffix 2 = training set.

# Mean absolute error
mae1 = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae2 = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

# Mean squared error
mse1 = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse2 = mean_squared_error(y_true=y_train, y_pred=y_pred_train)

# Root mean squared error, derived from the MSE values above
rmse1, rmse2 = np.sqrt(mse1), np.sqrt(mse2)

# Coefficient of determination
r_squared1 = r2_score(y_true=y_test, y_pred=y_pred)
r_squared2 = r2_score(y_true=y_train, y_pred=y_pred_train)

In [32]:
# Report each metric for the training split next to the test split so the gap
# between them (a sign of overfitting) is easy to read off.
print(f"MAE Train: {mae2} Test: {mae1}")
print(f"MSE Train: {mse2}  Test: {mse1}")
print(f"RMSE Train: {rmse2}  Test: {rmse1}")
# Label corrected from the misspelled "R_quared".
print(f"R_squared Train: {r_squared2}  Test: {r_squared1}")

MAE Train: 0.3496807968215396 Test: 0.9052343245654966
MSE Train: 0.2015941595084817  Test: 1.3519193839922272
RMSE Train: 0.44899238246153095  Test: 1.162720681845914
R_quared Train: 0.9997120796748322  Test: 0.9980587605763507
