# In [25]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

#Load the raw Telco churn data and discard customerID (an identifier, not a feature)
df = (
    pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
    .drop(columns=['customerID'])
)

#Group features

#Online Security, Online Backup, Device Protection, Tech Support
def combined_securitysupport(row):
    """Summarise a row's security/support add-ons as a single label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'InternetService', 'OnlineSecurity', 'OnlineBackup',
            'DeviceProtection' and 'TechSupport'.

    Returns:
        'No internet service' when InternetService is 'No'; otherwise
        'None', 'Some' or 'All' depending on how many of the four add-on
        columns equal 'Yes'.
    """
    if row['InternetService'] == 'No':
        return 'No internet service'

    addon_columns = ('OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport')
    subscribed = 0
    for column in addon_columns:
        if row[column] == 'Yes':
            subscribed += 1

    if subscribed == len(addon_columns):
        return 'All'
    if subscribed:
        return 'Some'
    return 'None'

#Streaming TV, Streaming Movies
def combined_entertainment(row):
    """Summarise a row's streaming subscriptions as a single label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'InternetService', 'StreamingTV' and 'StreamingMovies'.

    Returns:
        'No internet service' when InternetService is 'No'; otherwise
        'None', 'One' or 'Both' depending on how many of the two streaming
        columns equal 'Yes'.
    """
    if row['InternetService'] == 'No':
        return 'No internet service'

    streams_tv = row['StreamingTV'] == 'Yes'
    streams_movies = row['StreamingMovies'] == 'Yes'

    if streams_tv and streams_movies:
        return 'Both'
    if streams_tv or streams_movies:
        return 'One'
    return 'None'

#Multiple Lines, Phone Service, or No phone service
def combined_lineservice(row):
    """Collapse PhoneService and MultipleLines into one phone-line label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'PhoneService' and 'MultipleLines'.

    Returns:
        'No phone service' when PhoneService is 'No'; otherwise
        'Multiple lines' when MultipleLines is 'Yes', else 'Single line'.
    """
    #Without phone service, MultipleLines is never consulted
    if row['PhoneService'] == 'No':
        return 'No phone service'
    return 'Multiple lines' if row['MultipleLines'] == 'Yes' else 'Single line'
    
#One-hot/Ordinal encode certain features

#Build each grouped feature by applying its row-wise helper to every row
grouped_feature_builders = {
    'PhoneLineStatus': combined_lineservice,
    'SecuritySupportLevel': combined_securitysupport,
    'EntertainmentLevel': combined_entertainment,
}
for new_column, build_feature in grouped_feature_builders.items():
    df[new_column] = df.apply(build_feature, axis=1)

#Drop the raw service columns that the grouped features were derived from
superseded_columns = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'MultipleLines',
]
df.drop(columns=superseded_columns, inplace=True)

#Yes/No columns all share the same 1/0 coding
yes_no_codes = {'Yes': 1, 'No': 0}
for binary_column in ('Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn'):
    df[binary_column] = df[binary_column].map(yes_no_codes)

#Multi-valued columns get their own integer codes
contract_codes = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
df['Contract'] = df['Contract'].map(contract_codes)

internet_service_codes = {'DSL': 1, 'Fiber optic': 2, 'No': 0}
df['InternetService'] = df['InternetService'].map(internet_service_codes)

#One-hot encode PhoneLineStatus; drop=None keeps one indicator column per category
encoder = OneHotEncoder(sparse_output=False, drop=None)

encoded = encoder.fit_transform(df[['PhoneLineStatus']])

#Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
#0..n-1 index would misalign rows (and introduce NaNs) if df had been filtered
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(['PhoneLineStatus']),
    index=df.index,
)

# Combine with original DataFrame
df = pd.concat([df.drop('PhoneLineStatus', axis=1), encoded_df], axis=1)

#Each label's ordinal code is its position in the list (0 through 3)
security_support_order = ['No internet service', 'None', 'Some', 'All']
entertainment_order = ['No internet service', 'None', 'One', 'Both']

df['SecuritySupportLevel'] = df['SecuritySupportLevel'].map(
    {level: code for code, level in enumerate(security_support_order)}
)

df['EntertainmentLevel'] = df['EntertainmentLevel'].map(
    {level: code for code, level in enumerate(entertainment_order)}
)


#TODO: Independent and Dependent variables
#TODO: Train-test split
#TODO: Model training on training set
#TODO: Model evaluation on test set
#TODO: Visualizations
