In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

### Reading the data and extracting cross validation set

In [2]:
df = pd.read_csv("C:/Users/HP/Documents/AV interns/Hackathon datasets/intern hackathon/train_yhC5guX/train.csv")
test = pd.read_csv("C:/Users/HP/Documents/AV interns/Hackathon datasets/intern hackathon/test_Dlahabb/test.csv")

If you look deep into the test data set, you will observe that there is a coherence between training data set and test data set. If a particular user 'A' is present in the training data set, then 'A' is present in test data set also. If you have a closer look at any particular user, you will find that all the data available after a certain time stamp is kept in the test data set. In other words, the most recent data available about an user is kept in test data set.
I will make use of this fact in extracting cross validation set and will keep 30 most recent data points related to each user in the cross validation set. 

In [3]:
grouped =  df.sort_values(['user_id','ts_listen'])
grouped =  grouped.groupby(grouped.user_id)
user_intersection = (np.intersect1d(df['user_id'],test['user_id']))

In [4]:
df_cv = grouped.tail(30)
df_cv.shape
droplist = list(df_cv.ID)
df_train = df.drop(droplist)

### Dividing each day in different slots


In [5]:
hours = pd.to_datetime(df_train.ts_listen, format = '%Y-%m-%d %H:%M:%S').dt.hour
slots = [-1,6,9,18,22,24]
hour_slots = pd.cut(hours,slots)
df_train['time_of_day']= hour_slots

In [6]:
hours = pd.to_datetime(df_cv.ts_listen, format = '%Y-%m-%d %H:%M:%S').dt.hour
slots = [-1,6,9,18,22,24]
hour_slots = pd.cut(hours,slots)
df_cv['time_of_day']= hour_slots

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [7]:
hours = pd.to_datetime(test.ts_listen, format = '%Y-%m-%d %H:%M:%S').dt.hour
slots = [-1,6,9,18,22,24]
hour_slots = pd.cut(hours,slots)
test['time_of_day']= hour_slots

### Creating a feature which incorporates the number of times a user listens to a song in a particular genre

In [8]:
genre_count = df_train[['media_id','user_id','genre_id']].groupby(['user_id','genre_id']).agg('count').reset_index()
genre_count.columns = ['user_id','genre_id','user_genre_count']

In [9]:
df_train= df_train.merge(genre_count,how = 'left',on = ['user_id','genre_id'])
df_cv= df_cv.merge(genre_count,how = 'left',on = ['user_id','genre_id'])

In [10]:
df_cv.user_genre_count.fillna(-1,inplace=True)

In [11]:
test= test.merge(genre_count,how = 'left',on = ['user_id','genre_id'])

In [12]:
test.user_genre_count.fillna(-1,inplace=True)

### Creating feature user_context_frequency which incorporates the number of times an user listens to a particular context

In [13]:
user_context = df_train[['platform_family','user_id','context_type']].groupby(['user_id','context_type']).agg('count').reset_index()

In [14]:
user_context.columns = ['user_id','context_type','user_context_frequency']

In [15]:
df_train = df_train.merge(user_context,how='left',on=['user_id','context_type'])

In [16]:
df_cv = df_cv.merge(user_context,how='left',on=['user_id','context_type'])

In [17]:
df_cv.user_context_frequency.fillna(-1,inplace=True)

In [18]:
test = test.merge(user_context,how='left',on=['user_id','context_type'])

In [19]:
test.user_context_frequency.fillna(-1,inplace=True)

### Creating a feature which incorporates how many times a user pauses song while listening

In [20]:
dflisten = df_train[['platform_name','user_id','media_id','listen_type']].groupby(['user_id','media_id','listen_type']).agg('count').reset_index()

In [21]:
dflisten.columns= ['user_id','media_id','listen_type','flow_break_frequency']

In [22]:
dflisten = dflisten.loc[(dflisten.flow_break_frequency>=5) & (dflisten.listen_type == 1)]

In [23]:
dflisten.drop('listen_type',axis=1,inplace=True)

In [24]:
df_train = df_train.merge(dflisten,how='left',on=['user_id','media_id'])

In [25]:
df_train.flow_break_frequency.fillna(-1,inplace=True)

In [26]:
df_cv = df_cv.merge(dflisten,how='left',on=['user_id','media_id'])

In [27]:
df_cv.flow_break_frequency.fillna(-1,inplace=True)

In [28]:
test = test.merge(dflisten,how='left',on=['user_id','media_id'])

In [29]:
test.flow_break_frequency.fillna(-1,inplace=True)

### Creating a feature which tells whether the particular album is hit or not

In [30]:
hit_or_not = df_train.album_id.value_counts()>1000
hit_or_not = pd.DataFrame(hit_or_not).reset_index()

In [31]:
hit_or_not.columns= ['album_id','hit']

In [32]:
df_train=df_train.merge(hit_or_not,how='left',on='album_id')

In [33]:
df_cv=df_cv.merge(hit_or_not,how='left',on='album_id')

In [34]:
df_cv.hit.fillna(-1,inplace=True)

In [35]:
test=test.merge(hit_or_not,how='left',on='album_id')

In [36]:
test.hit.fillna(-1,inplace=True)

### Creating a feature which indicates whether an album is hit in that particular genre or not 

In [37]:
genre_album_count = df[['platform_name','genre_id','album_id']].groupby(['genre_id','album_id']).agg('count').reset_index()

In [38]:
newdf=genre_album_count.loc[(genre_album_count.platform_name>=100)]

In [39]:
newdf['hit_album_in_genre'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [40]:
newdf.drop('platform_name',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [41]:
df_train = df_train.merge(newdf,how='left',on=['genre_id','album_id'])

In [42]:
df_train.hit_album_in_genre.fillna(False,inplace=True)

In [43]:
df_cv = df_cv.merge(newdf,how='left',on=['genre_id','album_id'])

In [44]:
df_cv.hit_album_in_genre.fillna(False,inplace=True)

In [45]:
test = test.merge(newdf,how='left',on=['genre_id','album_id'])

In [46]:
test.hit_album_in_genre.fillna(False,inplace=True)

In [47]:
 df_train.drop(['ts_listen','release_date','platform_name','platform_family','ID','hit'],axis=1,inplace=True)

In [48]:
df_cv.drop(['ts_listen','release_date','platform_name','platform_family','ID','hit'],axis=1,inplace=True)

In [49]:
 test.drop(['ts_listen','release_date','platform_name','platform_family','ID','hit'],axis=1,inplace=True)

In [50]:
df_train.columns

Index(['genre_id', 'media_id', 'album_id', 'context_type', 'media_duration',
       'listen_type', 'user_gender', 'user_id', 'artist_id', 'user_age',
       'is_listened', 'time_of_day', 'user_genre_count',
       'user_context_frequency', 'flow_break_frequency', 'hit_album_in_genre'],
      dtype='object')

In [51]:
df_train =  pd.get_dummies(df_train,columns=['hit_album_in_genre'],drop_first=True)

In [52]:
df_cv =  pd.get_dummies(df_cv,columns=['hit_album_in_genre'],drop_first=True)

In [53]:
test =  pd.get_dummies(test,columns=['hit_album_in_genre'],drop_first=True)

In [54]:
df_train =  pd.get_dummies(df_train,columns=['time_of_day'],drop_first=True)

In [55]:
df_cv =  pd.get_dummies(df_cv,columns=['time_of_day'],drop_first=True)

In [56]:
test =  pd.get_dummies(test,columns=['time_of_day'],drop_first=True)

In [57]:
test.shape

(554932, 18)

# implementation

In [58]:
cv_y = df_cv['is_listened']

In [59]:
cv_x = df_cv.drop(['is_listened'],axis=1)

In [60]:
train_y = df_train['is_listened']

In [61]:
train_x = df_train.drop(['is_listened'],axis=1)

# Logistic Regression

In [62]:
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression()
model_lr.fit(train_x,train_y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [63]:
pred_lr = pd.Series(model_lr.predict(test))
##pred_lr.to_csv("C:/Users/HP/Desktop/intern hackathon/binary prediction/pred_lr.csv")

In [64]:
from sklearn.metrics import accuracy_score
accuracy_score(cv_y,model_lr.predict(cv_x))

0.65335770716039432

In [65]:
cv_y.value_counts()

1    321324
0    164990
Name: is_listened, dtype: int64

In [66]:
baseline_accuracy = 321324/(164990+321324)


In [67]:
baseline_accuracy

0.6607336001019917

In [68]:
from sklearn.metrics import log_loss

In [69]:
log_loss(cv_y,model_lr.predict_proba(cv_x))

0.65012383499240967

In [70]:
from sklearn.metrics import confusion_matrix

In [71]:
confusion_matrix(cv_y,model_lr.predict(cv_x))

array([[  5189, 159801],
       [  8776, 312548]])

# Random Forest

In [72]:
from sklearn.ensemble import RandomForestClassifier

In [73]:
model_rf = RandomForestClassifier(n_estimators=40,max_depth=12)

In [74]:
model_rf.fit(train_x,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [75]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

In [76]:
log_loss(cv_y,model_rf.predict_proba(cv_x))

0.61015276745833791

In [77]:
accuracy_score(cv_y,model_rf.predict(cv_x))

0.67834362161072892

In [78]:
confusion_matrix(cv_y,model_rf.predict(cv_x))

array([[ 20230, 144760],
       [ 11666, 309658]])

In [79]:
basic_prediction = model_rf.predict_proba(test)

In [80]:
pred_rf = pd.Series(model_rf.predict(test))

In [81]:
#pred_rf.to_csv("C:/Users/HP/Desktop/intern hackathon/binary prediction/pred_rf.csv")

In [82]:
feature_impo_rf = model_rf.feature_importances_



In [83]:
feature_impo_rf

array([ 0.0331764 ,  0.05178738,  0.04974363,  0.13165976,  0.05247785,
        0.14389956,  0.01561831,  0.12153661,  0.04315251,  0.12885213,
        0.07178543,  0.08272189,  0.00176577,  0.01306177,  0.00754257,
        0.00809623,  0.03660235,  0.00651984])

# storing submission

In [None]:
#test1 = pd.read_csv("C:/Users/HP/Desktop/intern hackathon/test_Dlahabb/test.csv")
#ID = test1['ID']

In [None]:
#list1 = []
#for elements in basic_prediction:
    #list1.append(elements[1])

In [None]:
#list1 = np.array(list1)

In [None]:
#submission_basicmodel = pd.DataFrame(ID)
#submission_basicmodel['is_listened'] = list1


In [None]:
#submission_basicmodel.to_csv('C:/Users/HP/Desktop/intern hackathon/final_solution.csv')

# Naive bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
model_nb = GaussianNB()
model_nb.fit(train_x,train_y)

In [None]:
pred_nb =  pd.Series(model_nb.predict(test))

In [None]:
accuracy_score(cv_y,model_nb.predict(cv_x))

In [None]:
log_loss(cv_y,model_nb.predict_proba(cv_x))

In [None]:
confusion_matrix(cv_y,model_rf.predict(cv_x))

In [None]:
#pred_nb.to_csv("C:/Users/HP/Desktop/intern hackathon/binary prediction/pred_nb.csv")

# boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
model_xgb = GradientBoostingClassifier()
model_xgb.fit(train_x,train_y)

In [None]:
pred_gb =  pd.Series(model_xgb.predict(test))

In [None]:
accuracy_score(cv_y,model_xgb.predict(cv_x))

In [None]:
log_loss(cv_y,model_xgb.predict_proba(cv_x))

In [None]:
confusion_matrix(cv_y,model_rf.predict(cv_x))

In [None]:
#pred_gb.to_csv("C:/Users/HP/Desktop/intern hackathon/binary prediction/pred_gb.csv")

# extra tree gini

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
model_etc = ExtraTreesClassifier()
model_etc.fit(train_x,train_y)

In [None]:
pred_etc =  pd.Series(model_etc.predict(test))

In [None]:
accuracy_score(cv_y,model_etc.predict(cv_x))

In [None]:
log_loss(cv_y,model_etc.predict_proba(cv_x))

In [None]:
confusion_matrix(cv_y,model_etc.predict(cv_x))

In [None]:
#pred_etc.to_csv("C:/Users/HP/Desktop/intern hackathon/binary prediction/pred_etc.csv")

# voting ensemble

In [None]:
voting_prediction = pd.Series(index=range(0,554931))


In [None]:
superior_predictors = [pred_rf,pred_gb,]
inferior_predictors = [pred_lr,pred_etc,pred_nb]

In [None]:
for i in range(0,554931):
    a = 0
    b = 0
    for predictor in superior_predictors:
        if (predictor[i]) == 0:
            a = a + 2
        else:
            b = b + 2
    for predictor in inferior_predictors:
        if (predictor[i]) == 0:
            a = a+1
        else:
            b = b+1
    if (a>b):
        voting_prediction[i] = 0
    elif (b>a):
        voting_prediction[i] = 1
    else:
        voting_prediction[i] = pred_rf[i]


In [None]:
voting_prediction.value_counts()

In [None]:
pred_nb.value_counts()

# stacking ensemble

In [None]:
from sklearn.model_selection import KFold

In [None]:
kf = KFold(n_splits=5)

In [None]:
splits = kf.split(df_train)

In [None]:
splits_list = []
predictions_list= []

In [None]:
for element in splits:
    splits_list.append(element)

In [None]:
models_list = [model_xgb,model_rf,model_etc,model_nb]

In [None]:
for elements in splits_list:
    training_df = df_train.iloc[elements[0]]
    test_df = df_train.iloc[elements[1]]
    training_df_x = training_df.drop('is_listened',axis=1)
    test_df_x = test_df.drop('is_listened',axis=1)
    training_df_y = training_df['is_listened']
    model_xgb.fit(training_df_x,training_df_y)
    fold_prediction = model_xgb.predict(test_df_x)
    predictions_list.append(fold_prediction)

In [None]:
indices = []
for element in splits_list:
    indices.append(element[1])
indices = np.concatenate(indices,axis=0)

In [None]:
xgb_predictions = pd.DataFrame(np.concatenate(predictions_list,axis=0),index=indices)

In [None]:
xgb_predictions.columns = ['pred_xgb']

In [None]:
xgb_predictions