<a href="https://colab.research.google.com/github/Offliners/HTML_2021Fall/blob/main/Final%20Project/HTML2021_Fall.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Directory**

```
data_dir
        ├── data
        │   ├── Test_IDs.csv
        │   ├── ...
        │   ├── status.csv
        ├── statistics
        │   ├── Churn Category.png
        │   ├── ...
        │   ├── Under 30.png
```

# **Display information of GPU**

In [1]:
!nvidia-smi

Fri Jan  7 08:53:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Download Dataset**

In [2]:
!mkdir "./data"
!mkdir "./statistics"
!gdown --id 1X5yz7QLAu4nttnCea4ALf6alae6Clv_o --output "./data/dataset.zip"
!unzip -q "./data/dataset.zip" -d "./data"
!rm "./data/dataset.zip"

Downloading...
From: https://drive.google.com/uc?id=1X5yz7QLAu4nttnCea4ALf6alae6Clv_o
To: /content/data/dataset.zip
  0% 0.00/660k [00:00<?, ?B/s]100% 660k/660k [00:00<00:00, 19.0MB/s]


# **Import Some Packages**

In [3]:
import time
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# **CSV Files Combination**

In [4]:
train_path = './data/Train_IDs.csv'  # path to training data
test_path = './data/Test_IDs.csv'    # path to testing data

files = glob('./data/*.csv')
data_csv = []
data_csv.append(train_path)
for csv in files:
    if ('IDs' not in csv) and ('sample' not in csv) and ('population' not in csv) and ('result' not in csv):
        data_csv.append(csv)
  
print(data_csv)
df_list = [pd.read_csv(file) for file in data_csv]

['./data/Train_IDs.csv', './data/demographics.csv', './data/services.csv', './data/status.csv', './data/satisfaction.csv', './data/location.csv']


In [5]:
print(df_list[0]) # Train IDs

     Customer ID
0     0650-BWOZN
1     0562-FGDCR
2     6688-UZPWD
3     2905-KFQUV
4     9720-JJJOR
...          ...
5629  1178-PZGAB
5630  4806-KEXQR
5631  8809-RIHDD
5632  6663-JOCQO
5633  7010-ZMVBF

[5634 rows x 1 columns]


In [6]:
result = df_list[0]
for df in df_list[1:]:
    result = pd.merge(result, df, how='outer', on='Customer ID')

result.to_csv('./data/result.csv') # Save combined result to result.csv

# There are 7043 total customer data
# Train_IDs has 5634 customer data
# Test_Ids has 1409 customer data
print(result)

     Customer ID  Count_x  ...   Latitude   Longitude
0     0650-BWOZN      1.0  ...        NaN         NaN
1     0562-FGDCR      1.0  ...  34.903052 -118.411251
2     6688-UZPWD      1.0  ...  33.721917 -118.043237
3     2905-KFQUV      1.0  ...        NaN -122.000887
4     9720-JJJOR      1.0  ...  39.672813 -120.456699
...          ...      ...  ...        ...         ...
7038  3836-FZSDJ      NaN  ...        NaN         NaN
7039  6122-EFVKN      NaN  ...  37.556634 -122.317723
7040  9430-NKQLY      NaN  ...        NaN -122.752839
7041  8231-BSWXX      NaN  ...  34.097863 -116.594561
7042  4482-EWFMI      NaN  ...        NaN -120.132870

[7043 rows x 48 columns]


In [7]:
print(result.dropna()) # Find customer with full data info

     Customer ID  Count_x  ...   Latitude   Longitude
666   0454-OKRCT      1.0  ...  38.425280 -119.475741
678   1735-XMJVH      1.0  ...  38.809175 -121.171375
1799  1245-HARPS      1.0  ...  40.587919 -122.464732
2805  8445-DNBAE      1.0  ...  41.212695 -122.392067
2883  8708-XPXHZ      1.0  ...  40.342928 -124.063329
3062  9522-ZSINC      1.0  ...  34.128284 -118.047732
4297  0836-SEYLU      1.0  ...  36.414611 -121.638600
5146  7274-RTAPZ      1.0  ...  36.657462 -119.595293

[8 rows x 48 columns]


In [8]:
result_cols = result.columns
print(result_cols)

Index(['Customer ID', 'Count_x', 'Gender', 'Age', 'Under 30', 'Senior Citizen',
       'Married', 'Dependents', 'Number of Dependents', 'Count_y', 'Quarter',
       'Referred a Friend', 'Number of Referrals', 'Tenure in Months', 'Offer',
       'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Churn Category', 'Satisfaction Score', 'Count',
       'Country', 'State', 'City', 'Zip Code', 'Lat Long', 'Latitude',
       'Longitude'],
      dtype='object')


# **Statistics and Data Preprocessing**

In [9]:
result_copy = result.copy(deep=True)
def label_statistics(label):
    freq = result.groupby(label).size() 
    # print(freq)

    names = [name for name, _ in freq.items()]
    counts = [count for _, count in freq.items()]

    x = np.arange(len(names))
    plt.bar(x, counts)
    plt.xticks(x, names, rotation=15)
    plt.title(label)
    plt.savefig(f'./statistics/{label}.png')
    print(f'{label} saved')
    plt.close()

    # Replace NaN with the most frequent label
    result_copy[label] = result_copy[label].fillna(names[counts.index(max(counts))])

In [10]:
need_statistics_col = ['Churn Category', 'Satisfaction Score', 
       'Gender', 'Under 30', 'Senior Citizen', 'Married', 'Dependents',
       'Number of Dependents', 'Country', 'State', 'City', 'Quarter',
       'Referred a Friend', 'Number of Referrals', 'Offer',
       'Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method']

# Replace NaN with the most frequent label
for need_col in need_statistics_col:
    label_statistics(need_col)

    # Encode target labels with value
    le = LabelEncoder()
    result_copy[need_col] = le.fit_transform(result_copy[need_col])

    if need_col == 'Churn Category':
        encoder_map = dict(zip(le.classes_, le.transform(le.classes_)))

Churn Category saved
Satisfaction Score saved
Gender saved
Under 30 saved
Senior Citizen saved
Married saved
Dependents saved
Number of Dependents saved
Country saved
State saved
City saved
Quarter saved
Referred a Friend saved
Number of Referrals saved
Offer saved
Phone Service saved
Multiple Lines saved
Internet Service saved
Internet Type saved
Online Security saved
Online Backup saved
Device Protection Plan saved
Premium Tech Support saved
Streaming TV saved
Streaming Movies saved
Streaming Music saved
Unlimited Data saved
Contract saved
Paperless Billing saved
Payment Method saved


In [11]:
need_avg_col = [item for item in result_cols if item not in need_statistics_col]

# Replace NaN with average value or 0
for avg_col in need_avg_col[1:]:
    if avg_col == 'Zip Code' or avg_col == 'Lat Long':
        continue
    elif 'Count' in avg_col:
        result_copy[avg_col] = result_copy[avg_col].fillna(0)
    else:
        result_copy[avg_col] = result_copy[avg_col].fillna(result_copy[avg_col].mean())

# Throw away the data columns I think is useless
result_copy = result_copy.drop('Count_x', axis=1)
result_copy = result_copy.drop('Count_y', axis=1)
result_copy = result_copy.drop('Country', axis=1)
result_copy = result_copy.drop('State', axis=1)
result_copy = result_copy.drop('City', axis=1)
result_copy = result_copy.drop('Zip Code', axis=1)
result_copy = result_copy.drop('Lat Long', axis=1)
result_copy = result_copy.drop('Latitude', axis=1)
result_copy = result_copy.drop('Longitude', axis=1)
result_copy = result_copy.drop('Count', axis=1)
result_copy = result_copy.drop('Quarter', axis=1)

In [12]:
result_copy.to_csv('./data/result_after_preprocessing.csv') # Save after preprocessing result to result_after_preprocessing.csv

print(result_copy)

     Customer ID  Gender  ...  Churn Category  Satisfaction Score
0     0650-BWOZN       1  ...               1                   0
1     0562-FGDCR       1  ...               1                   0
2     6688-UZPWD       1  ...               3                   2
3     2905-KFQUV       1  ...               1                   2
4     9720-JJJOR       0  ...               3                   2
...          ...     ...  ...             ...                 ...
7038  3836-FZSDJ       1  ...               3                   4
7039  6122-EFVKN       1  ...               3                   2
7040  9430-NKQLY       1  ...               3                   2
7041  8231-BSWXX       1  ...               3                   2
7042  4482-EWFMI       1  ...               3                   2

[7043 rows x 37 columns]


# **Train**

In [13]:
sc = StandardScaler()

y_train = result_copy['Churn Category'].values
X_train = result_copy.drop('Customer ID', axis=1)
X_train = X_train.drop('Churn Category', axis=1)

ss = StandardScaler().fit(X_train)
X_train_std = ss.transform(X_train)
mms = MinMaxScaler(feature_range=(0, 1)).fit(X_train_std)
X_train_std = mms.transform(X_train_std)

In [14]:
svm = SVC(kernel='rbf',  gamma=0.7, C=10)

start_time = time.time()
svm.fit(X_train, y_train)
end_time = time.time()

print(f'Training use {round(end_time - start_time, 3)}s')

Training use 13.228s


In [15]:
y_train_predict = svm.predict(X_train_std)
Ein = np.mean(np.array(y_train_predict) != y_train)
print(f'Ein = {round(Ein, 5)}')

  "X does not have valid feature names, but"


Ein = 0.15732


# **Test**

In [16]:
df_test = pd.read_csv(test_path)

test_result = pd.merge(df_test, result_copy, how='inner', on='Customer ID')

X_test = test_result.drop('Customer ID', axis=1)
X_test = X_test.drop('Churn Category', axis=1)
ss = StandardScaler().fit(X_test)
X_test_std = ss.transform(X_test)
mms = MinMaxScaler(feature_range=(0, 1)).fit(X_test_std)
X_test = mms.transform(X_test_std)
y_test = svm.predict(X_test)

  "X does not have valid feature names, but"


In [17]:
submit_result = './result.csv'

new_encoder_map = {'No Churn':0, 'Competitor': 1, 'Dissatisfaction':2, 'Attitude': 3, 'Price':4, 'Other':5}

with open(submit_result, 'w') as f:
    f.write('Customer ID,Churn Category\n')
    for i in range(len(df_test.values)):
        id = str(df_test.values[i]).replace('[\'', '')
        id = id.replace('\']', '')
        pred = new_encoder_map.get(list(encoder_map.keys())[list(encoder_map.values()).index(y_test[i])])
        f.write(f'{id},{pred}\n')

In [18]:
from google.colab import files

files.download(submit_result)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>