In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
# Load the raw customer dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='data_regression.csv')

In [31]:
# Preview the first five rows (the default row count, spelled out).
df.head(n=5)

Unnamed: 0,year,customer_id,phone_no,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,2015,100198,409-8743,Female,36,62,no,no,148.35,12.2,16.81,82,1,4.0,1,0.0
1,2015,100643,340-5930,Female,39,149,no,no,294.45,7.7,33.37,87,3,3.0,2,0.0
2,2015,100756,372-3750,Female,65,126,no,no,87.3,11.9,9.89,91,1,4.0,5,1.0
3,2015,101595,331-4902,Female,24,131,no,yes,321.3,9.5,36.41,102,4,3.0,3,0.0
4,2015,101653,351-8398,Female,40,191,no,no,243.0,10.9,27.54,83,7,3.0,1,0.0


In [32]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(2000, 16)

In [33]:
# Fraction of missing entries in each column (mean of the boolean NA mask).
df.isna().mean()

year                      0.0000
customer_id               0.0000
phone_no                  0.0000
gender                    0.0120
age                       0.0000
no_of_days_subscribed     0.0000
multi_screen              0.0000
mail_subscribed           0.0000
weekly_mins_watched       0.0000
minimum_daily_mins        0.0000
maximum_daily_mins        0.0000
weekly_max_night_mins     0.0000
videos_watched            0.0000
maximum_days_inactive     0.0140
customer_support_calls    0.0000
churn                     0.0175
dtype: float64

**Missing values make up less than 2% of each affected column (gender, maximum_days_inactive, churn), so we drop the rows that contain them**

In [34]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [35]:
# Renumber rows 0..n-1 after the drop; the old row labels are kept as a new
# `index` column, which is removed later together with the other unused columns.
df = df.reset_index(drop=False)

In [36]:
# Count of missing entries per column; every value should now be zero.
df.isna().sum()

index                     0
year                      0
customer_id               0
phone_no                  0
gender                    0
age                       0
no_of_days_subscribed     0
multi_screen              0
mail_subscribed           0
weekly_mins_watched       0
minimum_daily_mins        0
maximum_daily_mins        0
weekly_max_night_mins     0
videos_watched            0
maximum_days_inactive     0
customer_support_calls    0
churn                     0
dtype: int64

**All the missing values are dropped**

In [37]:
# Inspect column dtypes to see which columns are non-numeric (object) and
# which numeric columns have an unexpected type.
df.dtypes

index                       int64
year                        int64
customer_id                 int64
phone_no                   object
gender                     object
age                         int64
no_of_days_subscribed       int64
multi_screen               object
mail_subscribed            object
weekly_mins_watched       float64
minimum_daily_mins        float64
maximum_daily_mins        float64
weekly_max_night_mins       int64
videos_watched              int64
maximum_days_inactive     float64
customer_support_calls      int64
churn                     float64
dtype: object

In [38]:
# Cast the target from float to integer; only the `churn` column is converted.
df = df.astype({'churn': int})

In [39]:
# Spot-check the first two rows after the cast.
df.head(n=2)

Unnamed: 0,index,year,customer_id,phone_no,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0,2015,100198,409-8743,Female,36,62,no,no,148.35,12.2,16.81,82,1,4.0,1,0
1,1,2015,100643,340-5930,Female,39,149,no,no,294.45,7.7,33.37,87,3,3.0,2,0


In [40]:
# (rows, columns) after dropping rows with missing values and resetting the index.
df.shape

(1918, 17)

**Drop Unwanted Columns**

In [41]:
# Remove the leftover row-label column and the columns not used from here on.
df = df.drop(['index', 'year', 'customer_id', 'phone_no'], axis='columns')

**Convert Categorical Columns to Numerical Columns**

In [42]:
from sklearn.preprocessing import LabelEncoder

In [43]:
# Names of all string-typed (object) columns that need numeric codes.
cat_cols = df.select_dtypes(include='object').columns.to_list()

# Fit a separate encoder per column and keep each one, so the integer codes
# can be mapped back to the original categories (`classes_`,
# `inverse_transform`) or applied to new data later. Sharing one throw-away
# encoder across columns would leave it holding only the last column's classes.
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

In [44]:
# Confirm the categorical columns now hold integer codes.
df.head(n=2)

Unnamed: 0,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0,36,62,0,0,148.35,12.2,16.81,82,1,4.0,1,0
1,0,39,149,0,0,294.45,7.7,33.37,87,3,3.0,2,0


In [45]:
# df.to_csv('churn_labeled_data.csv', index=False)

**Standardize the data**

In [46]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [47]:
# columns_to_standardize = df.columns[:-1]

# scaler = StandardScaler()

# scaler.fit(df[columns_to_standardize])

# # df[columns_to_standardize] = df[columns_to_standardize].apply(scaler.fit_transform)
# """The issue with this code is that DataFrame.apply() hands each column to scaler.fit_transform()
# as a separate 1-D Series. This means the scaler is refit for every column, and StandardScaler
# expects 2-D input of shape (n_samples, n_features), so the call fails on 1-D data."""

# """To fix this issue, fit the scaler once on the whole 2-D block of feature columns, and then transform that same block with the fitted scaler."""

# df[columns_to_standardize] = scaler.transform(df[columns_to_standardize])


In [None]:
# df.to_csv('Churn_Standardize_data.csv', index=False)

**Normalize the Data**

In [50]:
# Scale every feature column to [0, 1]. The target is excluded by name rather
# than by position, so the selection stays correct if the column order changes
# (and fails loudly with a KeyError if `churn` is missing).
columns_to_normalize = df.columns.drop('churn')

normal = MinMaxScaler()

# Learn each feature's min and max from the full 2-D feature block.
normal.fit(df[columns_to_normalize])

# Replace the feature columns with their scaled values; `churn` is left untouched.
df[columns_to_normalize] = normal.transform(df[columns_to_normalize])

In [51]:
# Verify that the feature columns are now scaled into the [0, 1] range.
df.head(n=2)

Unnamed: 0,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0.0,0.28125,0.252066,0.0,0.0,0.281927,0.61,0.281858,0.300752,0.052632,0.666667,0.111111,0
1,0.0,0.328125,0.61157,0.0,0.0,0.559578,0.385,0.559524,0.338346,0.157895,0.5,0.222222,0


In [52]:
# Persist the normalised frame without writing the row index as a column.
df.to_csv(path_or_buf='Churn_Normalize_data.csv', index=False)