In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Unziping Data
!7z e "./data/members.csv.7z"
!7z e "./data/songs.csv.7z"
!7z e "./data/test.csv.7z"
!7z e "./data/train.csv.7z"
!7z e "./data/song_extra_info.csv.7z"

In [None]:
# Load train.csv from the working directory into the main frame and preview the first rows
df = pd.read_csv('./train.csv')
df.head()

In [None]:
# Sampling only 10% of the rows (frac=0.1); random_state is fixed so the same
# rows are drawn on every run and downstream results are reproducible
df = df.sample(frac=0.1, random_state=42)

In [None]:
# Attach the song attributes to every row; a left join keeps all rows of df
songs = pd.read_csv('./songs.csv')
df = df.merge(songs, how='left', on='song_id')
del songs

# Attach the member attributes the same way, keyed on msno
members = pd.read_csv('./members.csv')
df = df.merge(members, how='left', on='msno')
del members

In [None]:
# Column dtypes and non-null counts of the merged frame
df.info()

In [None]:
# Summary statistics of the merged frame
df.describe()

In [None]:
# Per-column overview: dtype, number of distinct values and missing-value statistics
dtypes = pd.DataFrame(df.dtypes,columns=["Data Type"])

# Assignment aligns on the index (column names), so no pre-sorting is needed
dtypes["Unique Values"] = df.nunique()

dtypes["Null Values"] = df.isnull().sum()

# Scale the null share to 0-100 so the values match the "%" in the column label
dtypes["% null Values"] = df.isnull().mean() * 100

# Columns with the most nulls first, shaded by magnitude within each column
dtypes.sort_values(by="Null Values", ascending=False).style.background_gradient(cmap='YlOrRd', axis=0)

**Visualizing Null Values**

In [None]:
# Draw the null mask of df as a heatmap, one column of cells per DataFrame column
fig, ax = plt.subplots(figsize=(25, 10))

sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
# Enlarge the x tick labels (the column names) so they stay readable on the wide figure
ax.tick_params(axis='x', labelsize=25)

# **Filling Null Values**

In [None]:
# One fill value per column: the literal "unknown" for object (string) columns,
# the column's own mean for every other column
fill_values = {name: "unknown" for name in df.select_dtypes(include=['object']).columns}
fill_values.update(
    {name: df[name].mean() for name in df.select_dtypes(exclude=['object']).columns}
)
df = df.fillna(value=fill_values)

In [None]:
# Null count per column; every count is expected to be 0 once the fill has run
df.isna().sum()

# **Parsing Dates and Extracting new Date Features**

In [None]:
# Parse the YYYYMMDD date columns and derive <column>_year / _month / _day features.
# errors='coerce' turns unparseable values into NaT; the previous errors='ignore'
# handed back the unparsed column on any failure, which made the .dt accessor
# below raise instead of reporting the bad values.
for date_col in ('registration_init_time', 'expiration_date'):
    df[date_col] = pd.to_datetime(df[date_col], format='%Y%m%d', errors='coerce')
    df[f'{date_col}_year'] = df[date_col].dt.year
    df[f'{date_col}_month'] = df[date_col].dt.month
    df[f'{date_col}_day'] = df[date_col].dt.day

In [None]:
# Dates to category: each distinct date becomes one category level
for date_col in ('registration_init_time', 'expiration_date'):
    df[date_col] = df[date_col].astype('category')

In [None]:
# Preview the frame, now including the derived year/month/day columns
df.head()

# **Encoding and Categorizing Columns**

In [None]:
# Replace every object or category column with its integer category codes.
# astype('category') is a no-op for columns that are already categorical.
for name in df.select_dtypes(include=['object', 'category']).columns:
    df[name] = df[name].astype('category').cat.codes

In [None]:
# Re-check the dtypes now that the object and category columns hold integer codes
df.info()

In [None]:
# Summary statistics again, after encoding
df.describe()

In [None]:
# Correlation matrix of all columns, with the coefficient printed in each cell
fig, ax = plt.subplots(figsize=[20, 10])
sns.heatmap(df.corr(), annot=True, ax=ax)
plt.show()

# **Train and Test split**

In [None]:
# Separate the label column from the features (pop also removes it from df),
# then hold out 20% of the rows for evaluation. random_state pins the shuffle
# so the split, and every score computed from it, is reproducible across runs.
target = df.pop('target')
train_data, test_data, train_labels, test_labels = train_test_split(
    df, target, test_size=0.2, random_state=42)

print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(test_labels.shape)

# **Feature Selection with Random Forest**

In [None]:
# Random forest with default hyper-parameters (no tuning is performed here).
# random_state is fixed so the fitted trees, and the feature importances read
# from them later, are the same on every run.
RF = RandomForestClassifier(random_state=42)
RF.fit(train_data, train_labels)

# Score on the data the forest was trained on
RF_TrainScore = RF.score(train_data, train_labels)
RF_TrainScore

In [None]:
# Score of the random forest on the held-out split
RF_TestScore = RF.score(test_data, test_labels)
RF_TestScore

In [None]:
# Random-forest predictions for the held-out split
y_pred_rf =RF.predict(test_data)

In [None]:
# Confusion matrix of the random forest on the held-out split
from sklearn.metrics import confusion_matrix

cm_rf = confusion_matrix(y_true=test_labels, y_pred=y_pred_rf)
# fmt='g' prints the counts without scientific notation
sns.heatmap(cm_rf, annot=True, fmt='g')

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the random forest on the held-out split
print(classification_report(test_labels, y_pred_rf))

**Feature Selection**

In [None]:
# Pair each column name of df with the forest's importance score, most important first
importance_by_feature = {
    'features': df.columns,
    'importances': RF.feature_importances_,
}
df_plot = pd.DataFrame(importance_by_feature).sort_values('importances', ascending=False)

In [None]:
# Horizontal bars: one per feature, length = importance
fig, ax = plt.subplots(figsize=[15, 10])
sns.barplot(data=df_plot, x='importances', y='features', ax=ax)
ax.set_title('Importances of Features')
plt.show()

In [None]:
# Drop columns with importances < 0.04.
# train_data / test_data were split off from df before this cell, so they are
# filtered here as well; otherwise the models fitted afterwards would still be
# trained on every feature and the selection would have no effect on them.
IMPORTANCE_THRESHOLD = 0.04
low_importance = df_plot.loc[df_plot.importances < IMPORTANCE_THRESHOLD, 'features'].tolist()

# errors='ignore' keeps the cell re-runnable once the columns are already gone
df = df.drop(columns=low_importance, errors='ignore')
train_data = train_data.drop(columns=low_importance, errors='ignore')
test_data = test_data.drop(columns=low_importance, errors='ignore')

In [None]:
# Shape and names of the columns remaining in df after the low-importance ones were dropped
print(df.shape)
df.columns

In [None]:
# Correlation matrix of the columns currently in df
fig, ax = plt.subplots(figsize=[20, 10])
sns.heatmap(df.corr(), annot=True, ax=ax)
plt.show()

# **XGBoosting**

In [None]:
import xgboost as xgb

In [None]:
# Build the XGBoost classifier from an explicit settings mapping, then fit it
xgb_settings = {'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 5}
XGB = xgb.XGBClassifier(**xgb_settings)
XGB.fit(train_data, train_labels)

# Score on the data the model was trained on
XGB_TrainScore = XGB.score(train_data, train_labels)
XGB_TrainScore

In [None]:
# Score of the XGBoost model on the held-out split
XGB_TestScore = XGB.score(test_data, test_labels)

XGB_TestScore

In [None]:
# XGBoost predictions for the held-out split
y_pred_xgb =XGB.predict(test_data)

In [None]:
# Confusion matrix of the XGBoost predictions on the held-out split.
# Stored as cm_xgb so the name identifies the model it was computed from.
cm_xgb = confusion_matrix(test_labels, y_pred_xgb)
sns.heatmap(cm_xgb, annot=True, fmt='g')

In [None]:
# Text report of the classification metrics for XGBoost on the held-out split
print(classification_report(test_labels, y_pred_xgb))

# **LightGBM**

In [None]:
import lightgbm as lgb

# Wrap the training features and labels in a LightGBM Dataset, the input type passed to lgb.train
d_train = lgb.Dataset(train_data, label=train_labels)

In [None]:
# LightGBM training configuration, passed unchanged to lgb.train
params = dict(
    # task and booster type
    objective='binary',
    boosting='dart',
    learning_rate=0.2,
    verbose=0,
    # tree size
    num_leaves=100,
    # row subsampling, with a fixed seed
    bagging_fraction=0.95,
    bagging_freq=1,
    bagging_seed=1,
    # column subsampling, with a fixed seed
    feature_fraction=0.9,
    feature_fraction_seed=1,
    max_bin=256,
    num_rounds=100,
    # evaluation metric
    metric='auc',
)

In [None]:
# Train the booster on d_train; the third positional argument (100) is the number of boosting rounds.
# NOTE(review): params also carries 'num_rounds': 100, which looks like the same setting given twice -
# confirm which one LightGBM honours and keep only one.
clf = lgb.train(params, d_train, 100)

In [None]:
# Raw booster outputs for the training split (converted to 0/1 labels in a later cell)
y_predtrain_lgbm=clf.predict(train_data)

# Raw booster outputs for the held-out split
y_predtest_lgbm=clf.predict(test_data)

In [None]:
# Turn the predicted scores for the training and test sets into hard 0/1 labels
# using a 0.5 cut-off (scores >= 0.5 become 1, everything else 0).
# A vectorised comparison replaces the element-by-element Python loops; casting
# back to the array's own dtype keeps the results as 1.0 / 0.0, as the loops did.
y_predtrain_lgbm = (y_predtrain_lgbm >= .5).astype(y_predtrain_lgbm.dtype)
y_predtest_lgbm = (y_predtest_lgbm >= .5).astype(y_predtest_lgbm.dtype)

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so the
# values are unchanged; the arguments are simply given in the documented order.
LGBM_TrainScore = accuracy_score(train_labels, y_predtrain_lgbm)
LGBM_TestScore = accuracy_score(test_labels, y_predtest_lgbm)
# Print accuracy; the first line reports the training split and is labelled accordingly
print ("Train Accuracy with LGBM = ", LGBM_TrainScore)
print ("Test Accuracy with LGBM = ", LGBM_TestScore)

In [None]:
# Confusion matrix of the thresholded LightGBM predictions on the held-out split
from sklearn.metrics import confusion_matrix

cm_lgbm = confusion_matrix(y_true=test_labels, y_pred=y_predtest_lgbm)
# fmt='g' prints the counts without scientific notation
sns.heatmap(cm_lgbm, annot=True, fmt='g')

In [None]:
# Text report of the classification metrics for LightGBM on the held-out split
print(classification_report(test_labels, y_predtest_lgbm))

In [None]:
from catboost import CatBoostClassifier

# **CatBoost Classifier**

In [None]:
# Build the CatBoost classifier from an explicit settings mapping, then fit it
catboost_settings = {'learning_rate': 0.1, 'depth': 10, 'iterations': 300}
CatBoost = CatBoostClassifier(**catboost_settings)
CatBoost.fit(train_data, train_labels)

# Score on the data the model was trained on
CatBoost_TrainScore = CatBoost.score(train_data, train_labels)


In [None]:
# Display CatBoost's score on the training split
CatBoost_TrainScore

In [None]:
# Score of the CatBoost model on the held-out split
CatBoost_TestScore = CatBoost.score(test_data, test_labels)
CatBoost_TestScore

In [None]:
# CatBoost predictions for the held-out split
y_pred_catboost = CatBoost.predict(test_data)

In [None]:
# Confusion matrix of the CatBoost predictions on the held-out split.
# The heatmap now draws cm_catboost; it previously plotted cm_lgbm, i.e. another model's matrix.
cm_catboost = confusion_matrix(test_labels, y_pred_catboost)
sns.heatmap(cm_catboost, annot=True, fmt='g')

In [None]:
# Text report of the classification metrics for CatBoost on the held-out split
print(classification_report(test_labels, y_pred_catboost))

# **Comparing Boosting Results**

In [None]:
# One row per model: display name, score on the training split, score on the held-out split
score_rows = [
    ("Random Forest", RF_TrainScore, RF_TestScore),
    ("XGBoost", XGB_TrainScore, XGB_TestScore),
    ("Light LGBM", LGBM_TrainScore, LGBM_TestScore),
    ("CatBoost", CatBoost_TrainScore, CatBoost_TestScore),
]
score_columns = ["Model", "Training Accuracy %", "Test Evaluation %"]

# Best held-out score first
results = pd.DataFrame(score_rows, columns=score_columns)
results = results.sort_values(by="Test Evaluation %", ascending=False)
results.style.background_gradient(cmap='BuPu')