In [None]:
# Link for json api:
# https://api.csvgetter.com/demo/yalhQplODURQXamM9zIf

import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

%matplotlib inline

In [None]:
# Directory holding the churn CSV files. Set the CHURN_DATA_DIR environment
# variable to point somewhere else; the default is the original local checkout.
DATA_DIR = Path(os.environ.get('CHURN_DATA_DIR', 'C:\\Users\\johan\\git\\Python\\Kaggle\\Churn'))

# Features and labels live in separate files that share a CustomerId column.
X_train = pd.read_csv(DATA_DIR / 'X_train.csv')
X_test = pd.read_csv(DATA_DIR / 'X_test.csv')
y_train = pd.read_csv(DATA_DIR / 'y_train.csv')
y_test = pd.read_csv(DATA_DIR / 'test_label' / 'y_test.csv')

# Training features joined with their labels; this frame drives the exploration below.
df = pd.merge(X_train, y_train, on='CustomerId', how='outer')

In [None]:
# Preview the first 10 rows of the merged training frame.
df.head(10)

In [None]:
# (rows, columns) of the merged training frame.
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Per-column flag: True when the column holds at least one missing value.
df.isna().any()

In [None]:
# Summary statistics of the numeric columns, rendered as strings with
# thousands separators and three decimals for readability.
df.describe().map('{:,.3f}'.format)

In [None]:
# Share of each value of the target column 'Exited' (proportions, not counts).
df['Exited'].value_counts(normalize=True)

In [None]:
# For every column, report whether all of its values are distinct.
for name, values in df.items():
    print(name, values.is_unique)


In [None]:
# For every column, report how many distinct (non-missing) values it holds.
for name, distinct_count in df.nunique().items():
    print(name, distinct_count)

In [None]:
# Use CustomerId as the row index. The guard keeps the cell safe to re-run:
# once CustomerId is the index it is no longer a column, and set_index would
# raise KeyError on a second execution.
if 'CustomerId' in df.columns:
    df = df.set_index('CustomerId')

In [None]:
# Count of each distinct Gender label as it appears in the data.
df['Gender'].value_counts()

In [None]:
# Normalise Gender labels: trim surrounding whitespace and capitalise, so
# spelling variants collapse into a single category. The vectorised .str
# accessor passes missing values through as NaN, where a per-element lambda
# calling .strip() would raise on them.
df['Gender'] = df['Gender'].str.strip().str.capitalize()
df['Gender'].value_counts()

In [None]:
# Mean of 'Exited' per Gender, drawn without error bars or legend.
sns.barplot(
    data=df,
    x='Gender',
    y='Exited',
    hue='Gender',
    palette='Blues',
    legend=False,
    errorbar=None,
)

In [None]:
# Percentage of rows per Gender, split by the value of 'Exited'.
sns.countplot(
    data=df,
    x='Gender',
    hue='Exited',
    stat='percent',
    palette='Blues',
)

In [None]:
# Column groups used by the exploration cells that follow.
numerical_features = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]
categorical_features = [
    'Surname',
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
]

# One sub-frame per group, selected by column label.
numerical_df = df.loc[:, numerical_features]
categorical_df = df.loc[:, categorical_features]

In [None]:
# Display the numerical sub-frame.
numerical_df

In [None]:
# Display the categorical sub-frame.
categorical_df

In [None]:
# Searching for outliers using the std method: a value is flagged when it lies
# more than three standard deviations away from its column mean.
# Optional: pd.set_option('display.max_rows', 150) to show more rows.
deviation = (numerical_df - numerical_df.mean()).abs()
mask = deviation > numerical_df.std() * 3

# Keep only flagged values (everything else becomes NaN), then discard rows
# and columns that contain no flagged value at all.
outlier_std_df = (
    numerical_df.where(mask)
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)
outlier_std_df

In [None]:
# Number of values flagged by the std method, per remaining column.
outlier_std_df.count()

In [None]:
# Use the IQR method for outliers and see if the results are different:
# a value is flagged when it falls outside [q1 - 1.5*iqr, q3 + 1.5*iqr].
q1, q3 = numerical_df.quantile(.25), numerical_df.quantile(.75)
iqr = q3 - q1
lower_fence = q1 - iqr * 1.5
upper_fence = q3 + iqr * 1.5

mask = numerical_df.lt(lower_fence) | numerical_df.gt(upper_fence)

# Keep only flagged values, then discard rows and columns with none.
outlier_iqr_df = (
    numerical_df.where(mask)
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)
outlier_iqr_df



In [None]:
# Number of values flagged by the IQR method, per remaining column. A cell only
# displays its last expression, so this result has to be printed explicitly.
print(outlier_iqr_df.count())

# Distribution of the ages flagged by the std method.
# NOTE(review): this reads outlier_std_df while the line above reads
# outlier_iqr_df; confirm the std frame is the intended one here.
outlier_std_df['Age'].value_counts()

In [None]:
# Full records of the customers whose Age was flagged by the IQR method,
# oldest first (top 50).
indexes = outlier_iqr_df['Age'].dropna().index
age_outlier_rows = df.loc[indexes]
age_outlier_rows.sort_values(by='Age', ascending=False).head(50)

In [None]:
# Rows with at least two non-NaN entries, i.e. flagged by the IQR method in two or more features.
outlier_iqr_df.dropna(axis=0, thresh=2)

In [None]:
# Full records of the customers whose CreditScore was flagged by the IQR
# method (first 20).
idx = outlier_iqr_df['CreditScore'].dropna().index
credit_outlier_rows = df.loc[idx]
credit_outlier_rows.head(20)

In [None]:

# How many customers hold each number of products. A cell only displays its
# last expression, so this result has to be printed explicitly.
print(df['NumOfProducts'].value_counts())

# Mean of 'Exited' for every (NumOfProducts, Geography) combination.
df.groupby(by=['NumOfProducts', 'Geography'])['Exited'].mean()

In [None]:
# Pairwise correlation between the numeric features and the target,
# drawn as an annotated heatmap.
correlation_columns = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
    'Exited',
]
corr = df[correlation_columns].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')

In [None]:
# Picking features and encoding categorical features
dropped_columns = ['Surname', 'CustomerId', 'Balance', 'EstimatedSalary', 'CreditScore']
encoded_columns = ['Geography', 'Gender']


def _normalized_gender(frame):
    """Return the Gender column with whitespace trimmed and labels capitalised."""
    return frame['Gender'].str.strip().str.capitalize()


# Gender is normalised here as well as on df: without it, spelling variants of
# the same label would each become their own dummy column.
X_train = X_train.drop(columns=dropped_columns).assign(Gender=_normalized_gender)
X_train = pd.get_dummies(X_train, columns=encoded_columns)

X_test = X_test.drop(columns=dropped_columns).assign(Gender=_normalized_gender)
X_test = pd.get_dummies(X_test, columns=encoded_columns)

# The two frames are encoded independently, so force the test columns to match
# the training columns exactly (same set, same order). A category missing from
# the test set becomes an all-zero column; a category unseen in training is
# dropped because the model has no feature for it.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Labels only: the identifier is not a target.
y_train = y_train.drop(columns='CustomerId')
y_test = y_test.drop(columns='CustomerId')

In [None]:
# Scaling data: the scaler learns its statistics from the training features
# only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Build model: 200 trees of depth at most 5, with a fixed seed.
model = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=1)

# Flatten the label frame's values into a 1-D array before fitting.
train_labels = y_train.values.ravel()
model.fit(X_train, train_labels)

y_pred = model.predict(X_test)

In [None]:
# Overall accuracy on the test split. It is printed explicitly because a cell
# only displays its last expression and the report below is already a print.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

# Per-class precision, recall and F1 with four decimals.
print(classification_report(y_test, y_pred, digits=4))

In [None]:
# Plain-text dump of the training labels.
print(y_train)

In [None]:
# Plain-text dump of the labels predicted for the test split.
print(y_pred)

In [None]:

# Rebuild the labelled test set from the raw CSV files. By this point X_test
# has been reduced, encoded and scaled into a NumPy array and y_test no longer
# has a CustomerId column, so neither can be merged on 'CustomerId'.
# CHURN_DATA_DIR overrides the data location; the default is the original path.
data_dir = Path(os.environ.get('CHURN_DATA_DIR', 'C:\\Users\\johan\\git\\Python\\Kaggle\\Churn'))
X_test_raw = pd.read_csv(data_dir / 'X_test.csv')
y_test_raw = pd.read_csv(data_dir / 'test_label' / 'y_test.csv')

df2 = pd.merge(X_test_raw, y_test_raw, on='CustomerId', how='outer')

# Bring the test rows into the same shape as df before stacking them:
# normalised Gender labels and CustomerId as the index.
df2['Gender'] = df2['Gender'].str.strip().str.capitalize()
df2 = df2.set_index('CustomerId')

# Training rows followed by test rows, written out as one dataset.
final_df = pd.concat([df, df2], axis=0)
final_df.to_csv("churn_dataset.csv")

In [None]:
# Display the combined frame that was written to churn_dataset.csv.
final_df