In [None]:
import os

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

import scipy.stats as stats

In [None]:
# Load data into dataframe
# The CSV location can be overridden with the CHURN_DATA_PATH environment
# variable so the notebook can run on another machine; when the variable is
# not set, the original hard-coded location is used.
file = os.environ.get(
    "CHURN_DATA_PATH",
    r"C:\Users\woote\Desktop\WGU MSDA\[05] D209\churn_clean.csv",
)
df = pd.read_csv(file)
# Show every column when a frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
df.head()
#(Python, 2022)

In [None]:
# Remove variables that are not being used to answer question.
# Every column named here is discarded; all other columns of df are kept.
unusedColumns = [
    'CaseOrder', 'Customer_id', 'Interaction', 'UID', 'City',
    'State', 'County', 'Zip', 'Lat', 'Lng',
    'Population', 'Area', 'TimeZone', 'Job', 'Marital',
    'Port_modem', 'Tablet', 'InternetService', 'Phone', 'Multiple',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'PaperlessBilling', 'PaymentMethod', 'Bandwidth_GB_Year', 'Contacts', 'Email',
    'StreamingMovies', 'Age', 'Outage_sec_perweek', 'Yearly_equip_failure',
    'Income', 'Children',
]
reducedDf = df.drop(columns=unusedColumns)
reducedDf.head()

# Detect/Remove Duplicate Values in Dataset

In [None]:
# Find the Number of Duplicate Values in Each Column
# For each column, print its name followed by how many of its entries repeat
# a value that already appeared earlier in that column.
for column_name, column_values in reducedDf.items():
    duplicate_count = column_values.duplicated().sum()
    print(column_name)
    print(duplicate_count)
    print("==================================")

# Detect/Remove NULL Values in Dataset

In [None]:
# Detect NULL Values in Dataset
# Count the missing entries in each column and print the per-column totals.
null_counts = reducedDf.isna().sum()
print(null_counts)

# Detect/Remove Outliers in Dataset

In [None]:
# Analyzing qualitative data for inappropriate data points / outliers.
# For every categorical column, list its distinct values and how many there
# are so that unexpected categories can be spotted by eye.
qualOutliers = ['Gender','Churn','Techie','Contract','Item1'
                ,'Item2','Item3','Item4','Item5','Item6'
                ,'Item7','Item8']

for columnName in qualOutliers:
    # Read both the values and the count from reducedDf so the two numbers
    # printed on one line always describe the same frame.
    qualColumn = reducedDf[columnName]
    print(columnName.upper())                    # Make upper case for readability
    print(str(qualColumn.unique()) + " , Number Unique Items: " + str(qualColumn.nunique()))
    print("===============================")     # Divide output for readability

In [None]:
# Analyzing quantitative data for outliers
# When using z-score values +-3 are considered outliers
# Quantitative columns are whatever remains in reducedDf after the
# qualitative columns listed in qualOutliers are excluded.
quantOutliers = [column for column in reducedDf.columns if column not in qualOutliers]
print(quantOutliers)

# One 'Z Score <column>' column per quantitative variable.
zscore_df = pd.DataFrame({
    f'Z Score {column}': stats.zscore(reducedDf[column])
    for column in quantOutliers
})

# Histogram of each z-score column.
zscore_df.hist()
plt.tight_layout(pad = .5)

# Perform Encoding

In [None]:
# Perform Nominal Encoding
# Yes/No columns become 1/0. The result is assigned back to the column
# instead of using `reducedDf[col].replace(..., inplace=True)`: that form
# mutates a temporary Series and does not update the frame when pandas
# Copy-on-Write is active.
# Values other than "Yes"/"No" would become NaN under `map`.
nominalYN = {
    "Yes": 1
    ,"No": 0
}
for yesNoColumn in ['Techie', 'Churn']:
    reducedDf[yesNoColumn] = reducedDf[yesNoColumn].map(nominalYN)

# Contract is not a Yes/No column (it is one-hot encoded below), so it is
# deliberately not passed through the Yes/No mapping.

# Perform One-Hot Encoding on the Gender column, then on the Contract column.
# Each categorical column is replaced by one float indicator column per
# category, named after the category value itself.
for categoricalColumn in ['Gender', 'Contract']:
    encoding = pd.get_dummies(data=reducedDf[categoricalColumn], dtype=float)
    reducedDf = reducedDf.drop(columns=[categoricalColumn]).join(encoding)

reducedDf.head()

# Rename Columns

In [None]:
# Give the Item1-Item8 columns descriptive names and rename the
# 'Month-to-month', 'One year' and 'Two Year' columns to names without
# spaces or hyphens.
columnRenames = {
    'Item1': 'TimeResponse',
    'Item2': 'TimeFix',
    'Item3': 'TimeReplace',
    'Item4': 'Reliability',
    'Item5': 'Options',
    'Item6': 'RespectResponse',
    'Item7': 'CourtExch',
    'Item8': 'ActiveList',
    'Month-to-month': 'MTM',
    'One year': 'One_year',
    'Two Year': 'Two_year',
}
reducedDf = reducedDf.rename(columns=columnRenames)
reducedDf.head()

## Dropping More Variables

In [None]:
# Columns removed at this stage; everything else in reducedDf is kept.
droppedColumns = [
    'Female', 'MTM', 'ActiveList', 'RespectResponse', 'CourtExch',
    'TimeFix', 'Nonbinary', 'TimeReplace', 'Reliability', 'TimeResponse',
    'Options',
]
reducedDf = reducedDf.drop(columns=droppedColumns)
reducedDf.head()

## Correlation Analysis

In [None]:
# Check correlation coefficient of all variables in the data set.
# Make sure no variables have a high level of correlation.
# The figure size is configured first so the heatmap is drawn at 17x10.
sns.set(rc={"figure.figsize":(17, 10)})
stats_df = reducedDf
corr = stats_df.corr()
# Annotated heatmap of the pairwise correlation matrix.
sns.heatmap(data=corr, annot=True, cmap="Blues")
# (Bhandari, 2023)

## Create Train and Test Split

In [None]:
# Predictor variables
X = reducedDf[['Techie','Tenure','MonthlyCharge','Male','One_year', 'Two_year']]
# Response variable
y = reducedDf['Churn']
# Create test and training set (80% train / 20% test).
# random_state is fixed so that the same rows land in each split on every
# run; without it the exported CSVs and every score below change from run
# to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# (Geeks for Geeks, 2022)

In [None]:
# Observe output of Training dataset
# Print the training predictors, then the number of rows they contain.
print(X_train)
print(f"Number of rows in training set: {len(X_train)}")

In [None]:
# Observe output of Test dataset
# Print the test predictors, then the number of rows they contain.
print(X_test)
print(f"Number of rows in test set: {len(X_test)}")

## Output Training and Test Dataset

In [None]:
# Output of training and test datasets
# Each predictor split is written to its own CSV file in the working directory.
for split_frame, csv_name in ((X_train, 'D209_training.csv'), (X_test, 'D209_test.csv')):
    split_frame.to_csv(csv_name)

## Perform KNN Analysis (Pre Scaling)

In [None]:
# Fit a 5-nearest-neighbour classifier on the TRAINING split only.
# `fit` returns the estimator itself, so fitting the same object again on the
# test split would overwrite the training fit and make both scores describe a
# model that has already seen the test rows. The model is therefore fitted
# once and then scored on each split.
knn = KNeighborsClassifier(n_neighbors = 5)
knn_model_1 = knn.fit(X_train, y_train)
# Accuracy on the rows the model was fitted on (optimistic by construction).
print('k-NN accuracy score for training set: %f' % knn_model_1.score(X_train, y_train))
# Accuracy on held-out rows: the estimate of real-world performance.
print('k-NN accuracy score for test set: %f' % knn_model_1.score(X_test, y_test))
#(Bowne-Anderson, 2016) (Geeks for Geeks, 2023) (Wikipedia Contributors, 2023)

## Perform KNN Analysis (Post Scaling)

In [None]:
# Test/Training sets are created along with scaling and creating of KNN model.
# The data is split BEFORE scaling and the scaler is fitted on the training
# rows only, so the test rows never influence the scaling statistics.
# random_state is fixed so the split is the same on every run.
x_raw_train, x_raw_test, y_scale_train, y_scale_test = train_test_split(
    X, y, test_size=.2, random_state=42)
scaler = StandardScaler().fit(x_raw_train)
x_scale_train = scaler.transform(x_raw_train)
x_scale_test = scaler.transform(x_raw_test)

# A fresh estimator is bound to `knn` so that any model fitted earlier on the
# unscaled data is not refit, and so that later `knn.predict_proba` calls use
# the model trained on the scaled training rows. The model is fitted on the
# training split only and never on the test split.
knn = KNeighborsClassifier(n_neighbors = 5)
knn_model_3 = knn.fit(x_scale_train, y_scale_train)
print('k-NN accuracy score for scaled test set: %f' % knn_model_3.score(x_scale_test, y_scale_test))
print('k-NN accuracy score for scaled train set: %f' % knn_model_3.score(x_scale_train, y_scale_train))
# (Parthak, 2014) (Geeks for Geeks, 2023) (Wikipedia Contributors, 2023)

In [None]:
# Observe scaled training data
# Print the scaled training predictors, then the number of rows they contain.
print(x_scale_train)
print(f"Number of rows in x_scale_train : {len(x_scale_train)}")

In [None]:
# Observe scaled test data
# Print the scaled test predictors, then the number of rows they contain.
print(x_scale_test)
print(f"Number of rows in x_scale_test: {len(x_scale_test)}")

## Calculating Area Under Curve (AUC)

In [None]:
# Calculating AUC
# predict_proba returns one column per class; the column at index 1 is used
# as the score for the ROC curve (presumably the Churn = 1 class, given the
# 0/1 encoding of Churn - confirm against knn.classes_).
class_probabilities = knn.predict_proba(x_scale_test)
y_pred_prob = class_probabilities[:, 1]
auc = metrics.roc_auc_score(y_scale_test, y_pred_prob)
print(auc)
# (Zach, 2021)

### Interpreting AUC Scores
* 0.5 = No discrimination
* 0.5-0.7 = Poor discrimination
* 0.7-0.8 = Acceptable discrimination
* 0.8-0.9 = Excellent discrimination
* 0.9-1.0 = Outstanding discrimination

## References

1. https://www.geeksforgeeks.org/k-nearest-neighbours/ (K Nearest Neighbor)
2. https://www.scribbr.com/methodology/independent-and-dependent-variables/#vs (Determine if variable is independent or dependent)
3. https://www.datacamp.com/tutorial/preprocessing-in-data-science-part-1-centering-scaling-and-knn (KNN Algorithm Explanation)
4. https://www.askpython.com/python/examples/data-scaling-in-python (Standardization and Normalization of Data)
5. https://www.geeksforgeeks.org/how-to-do-train-test-split-using-sklearn-in-python/ (Making Train and Test Set)
6. https://realpython.com/knn-python/#a-step-by-step-knn-from-scratch-in-python (Thorough Explanation of KNN)
7. https://www.statology.org/auc-in-python/ (Calculating and Understanding Area Under Curve)
8. https://en.wikipedia.org/wiki/Accuracy_and_precision (Calculating accuracy in KNN model)