In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
train = pd.read_csv('train_0OECtn8.csv')

In [58]:
test_data = pd.read_csv('test_1zqHu22.csv')

In [6]:
print(f'''Number of rows : {train.shape[0]}
Number of columns : {train.shape[1]}''')

Number of rows : 89197
Number of columns : 10


## Exploratory data analysis

In [9]:
def unique_val(data,columnName_list):
    unique_values = []
    for col in list(data.columns):
        unique_values.append(data[col].nunique())
    return unique_values

In [11]:
def null_count(data):
    return pd.DataFrame({'Features':data.columns,
                 'Data type':data.dtypes.values,
                 'NaN count':data.isnull().sum().values,
                 'NaN percentage':(data.isnull().sum().values/data.shape[0])*100,
                'Unique_count':unique_val(data,list(data.columns))})


In [12]:
null_count(train)

Unnamed: 0,Features,Data type,NaN count,NaN percentage,Unique_count
0,row_id,int64,0,0.0,89197
1,user_id,int64,0,0.0,27734
2,category_id,int64,0,0.0,47
3,video_id,int64,0,0.0,175
4,age,int64,0,0.0,58
5,gender,object,0,0.0,2
6,profession,object,0,0.0,3
7,followers,int64,0,0.0,17
8,views,int64,0,0.0,43
9,engagement_score,float64,0,0.0,229


In [13]:
train.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,Male,Student,180,1000,4.33
1,2,5304,32,132,14,Female,Student,330,714,1.79
2,3,1840,12,24,19,Male,Student,180,138,4.35
3,4,12597,23,112,19,Male,Student,220,613,3.77
4,5,13626,23,112,27,Male,Working Professional,220,613,3.13


## Data Cleaning and preprocessing

In [16]:
train = train.iloc[:,1:]

In [19]:
train = pd.get_dummies(data=train, columns=['gender','profession'])
train.shape

(89197, 12)

In [20]:
train.head(2)

Unnamed: 0,user_id,category_id,video_id,age,followers,views,engagement_score,gender_Female,gender_Male,profession_Other,profession_Student,profession_Working Professional
0,19990,37,128,24,180,1000,4.33,0,1,0,1,0
1,5304,32,132,14,330,714,1.79,1,0,0,1,0


In [22]:
train.columns

Index(['user_id', 'category_id', 'video_id', 'age', 'followers', 'views',
       'engagement_score', 'gender_Female', 'gender_Male', 'profession_Other',
       'profession_Student', 'profession_Working Professional'],
      dtype='object')

In [23]:
train = train[['user_id', 'category_id', 'video_id', 'age', 'followers', 'views',
        'gender_Female', 'gender_Male', 'profession_Other',
       'profession_Student', 'profession_Working Professional','engagement_score']]

In [24]:
train.head(2)

Unnamed: 0,user_id,category_id,video_id,age,followers,views,gender_Female,gender_Male,profession_Other,profession_Student,profession_Working Professional,engagement_score
0,19990,37,128,24,180,1000,0,1,0,1,0,4.33
1,5304,32,132,14,330,714,1,0,0,1,0,1.79


## Train test split

In [59]:
y= train.iloc[:,-1:]
x = train.iloc[:,:-1]

In [62]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=13)

## Model Building

In [78]:
regressor = RandomForestRegressor(n_estimators = 100,random_state=42)
regressor.fit(x_train,y_train.values.ravel())


RandomForestRegressor(random_state=42)

In [79]:
y_pred = regressor.predict(x_test)

In [80]:
y_pred

array([3.6256, 3.4426, 4.1425, ..., 4.2664, 3.7303, 3.4562])

In [81]:
r2_score(y_test,y_pred)

0.35733488700292015

## Final Submission

In [85]:
def data_preprocessing(data):
    data = data.iloc[:,1:]
    data = pd.get_dummies(data=data, columns=['gender','profession'])
    data = data[['user_id', 'category_id', 'video_id', 'age', 'followers', 'views',
        'gender_Female', 'gender_Male', 'profession_Other',
       'profession_Student', 'profession_Working Professional']]
    
    return data

testing = data_preprocessing(test)

In [83]:
regressor.fit(x,y.values.ravel())
y_pred = regressor.predict(testing)

In [84]:
y_pred

array([4.0102, 2.9448, 2.717 , ..., 3.2695, 3.8974, 3.1108])

In [53]:
submission = test[['row_id']]

In [54]:
submission['engagement_score'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['engagement_score'] = y_pred


In [51]:
submission = pd.DataFrame(submission)

In [55]:
submission

Unnamed: 0,row_id,engagement_score
0,89198,4.030400
1,89199,3.022067
2,89200,2.743600
3,89201,3.786333
4,89202,1.686333
...,...,...
11116,100314,3.189800
11117,100315,3.171733
11118,100316,3.271600
11119,100317,3.885733


In [57]:
submission.to_csv('submission.csv',index=False)

In [82]:
testing

Unnamed: 0,user_id,category_id,video_id,age,followers,views,gender_Female,gender_Male,profession_Other,profession_Student,profession_Working Professional
0,7986,12,42,14,180,138,0,1,0,1,0
1,11278,34,115,14,230,840,0,1,0,1,0
2,17245,8,110,44,280,628,1,0,0,0,1
3,9851,16,137,18,270,462,0,1,0,1,0
4,16008,34,96,47,230,840,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
11116,26336,25,140,21,240,317,0,1,0,1,0
11117,6772,8,100,19,280,628,1,0,0,1,0
11118,2042,16,98,22,270,462,0,1,0,1,0
11119,24626,8,16,33,280,628,0,1,1,0,0
