In [133]:
import pandas as pd
import sklearn.preprocessing as pp

In [134]:
chum_csv = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(chum_csv)

In [135]:
# remove customerID
df2 = df.drop(columns=["customerID"])
# replace the "no-internet-service" and "no-phone-service" with NaN
df2.replace(["No phone service", "No internet service", " "],[pd.NA, pd.NA, pd.NA], inplace=True)
print(df2.columns)

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


In [136]:
# split columns like gender and internet service so that we can test the corelation, 
# and replace Yes wiht 1 and no with 0

df2 = pd.get_dummies(df2, columns=["gender", "InternetService", "Contract", "PaymentMethod"])

df2.replace(["Yes", "No"], [1, 0], inplace=True)

print(df2.columns)

Index(['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService',
       'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
       'MonthlyCharges', 'TotalCharges', 'Churn', 'gender_Female',
       'gender_Male', 'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')


In [137]:
# normalize the data and convert str to float where needed
df2["TotalCharges"] = df2["TotalCharges"].dropna().astype(float)

normalized_df = df2
# removes each row with na value
normalized_df.dropna(inplace=True)
# print(normalized_df2["InternetService_No"])
# normalizes the values of each column and replaces the values 
# of old column with values of the new column
for column in normalized_df:
    values = normalized_df.loc[:, column]
    max_value = max(values)
    if column == "InternetService_No":
        continue
    values = [value/max_value for value in values]
    normalized_df.loc[:, column] = values

print(normalized_df.columns)

Index(['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService',
       'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
       'MonthlyCharges', 'TotalCharges', 'Churn', 'gender_Female',
       'gender_Male', 'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')


  normalized_df.loc[:, column] = values


In [138]:
# find the corelation between every column not Churn and Churn
corr = normalized_df.corr()
churn_corr= corr[['Churn']].fillna(0)
# invert the sign of negative numbers
new_churn_corr = []
for x in churn_corr["Churn"]:
    if x < 0:
        new_churn_corr.append(x*-1)
    else:
        new_churn_corr.append(x)

churn_corr['Churn'] = new_churn_corr
churn_corr = churn_corr.sort_values("Churn")
        
print(churn_corr)

                                            Churn
InternetService_No                       0.000000
PhoneService                             0.000000
gender_Male                              0.011129
gender_Female                            0.011129
MonthlyCharges                           0.018666
MultipleLines                            0.019693
PaymentMethod_Mailed check               0.037398
StreamingTV                              0.039060
StreamingMovies                          0.046348
SeniorCitizen                            0.113042
PaperlessBilling                         0.137967
Dependents                               0.138511
PaymentMethod_Bank transfer (automatic)  0.139725
PaymentMethod_Credit card (automatic)    0.146398
Partner                                  0.169889
DeviceProtection                         0.176081
Contract_One year                        0.191249
OnlineBackup                             0.204154
InternetService_Fiber optic              0.257947


In [139]:
# we find the columns to remove by selecting everything but the last 10
to_be_removed = churn_corr[:-3].index

# make a new list with no duplication
print(to_be_removed)

Index(['InternetService_No', 'PhoneService', 'gender_Male', 'gender_Female',
       'MonthlyCharges', 'MultipleLines', 'PaymentMethod_Mailed check',
       'StreamingTV', 'StreamingMovies', 'SeniorCitizen', 'PaperlessBilling',
       'Dependents', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)', 'Partner', 'DeviceProtection',
       'Contract_One year', 'OnlineBackup', 'InternetService_Fiber optic',
       'InternetService_DSL', 'PaymentMethod_Electronic check', 'TechSupport',
       'OnlineSecurity', 'Contract_Two year', 'TotalCharges'],
      dtype='object')


In [140]:
# partually prossesed data frame with the least relevant columns removed

df4 = df2.drop(columns=to_be_removed)

df4 = df4.dropna()

print(df4.columns)

Index(['tenure', 'Churn', 'Contract_Month-to-month'], dtype='object')


In [141]:
from sklearn.model_selection import train_test_split
# training target
y = df4["Churn"]
X = df4.drop(columns="Churn")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

In [142]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train, y_train)

predicted_churn = neigh.predict(X_test)

In [143]:
from sklearn.metrics import accuracy_score, confusion_matrix

print(accuracy_score(y_test, predicted_churn))
print(confusion_matrix(y_test, predicted_churn))

0.6192052980132451
[[651 156]
 [304  97]]


In [153]:
from pandas import DataFrame
def find_accuracy(n_columns_to_keep, n_neigh, dataframe: DataFrame):
    to_be_removed_func = churn_corr[:-n_columns_to_keep].index
    
    df4_func = dataframe.drop(columns=to_be_removed_func)
    df4_func = df4_func.dropna()

    y_func = df4_func["Churn"]
    X_func = df4_func.drop(columns="Churn")

    X_func_train, X_func_test, y_func_train, y_func_test = train_test_split(X_func, y_func, random_state=5)
    

    neigh_n = KNeighborsClassifier(n_neighbors=n_neigh)

    neigh_n.fit(X_func_train, y_func_train)

    predicted_func_churn = neigh_n.predict(X_func_test)

    return accuracy_score(y_func_test, predicted_func_churn)

In [162]:
accuracy_results = []

for x in range(2, 29):
    for n in range(1, 100):
        accuracy = find_accuracy(x, n, df2)
        accuracy_results.append([x, n, accuracy])

In [172]:
from seaborn import heatmap
accuracy_df = DataFrame(accuracy_results, columns=["number of columns", "number of neighbours", "Accuracy"])

print(accuracy_df.max())

number of columns       28.000000
number of neighbours    99.000000
Accuracy                 0.779801
dtype: float64
