# Modeling
in previous notebook we investigated the linear algorithms for regression problem which we discussed previously. in this notebook we are going to discuss non-linear approaches for regression problem. the non-linear algorithms which we will check in this notebook are:
- k-nearest neighbor
- decision tree
- random forest

and in the next notebooks we will discuss classification and clustering algorithms.

the reason behind the dividing the notebooks is making the understanding of principles more easily.

----

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import warnings
import matplotlib
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 200)
import seaborn as sns
from openpyxl import load_workbook
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
from sklearn import preprocessing
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from tqdm import tqdm

In [2]:
xls = pd.ExcelFile('data/Main Dataset V3.0 .xlsx')
ad_post = pd.read_excel(xls, 'Ad-Post')
ad_story = pd.read_excel(xls, 'Ad-Story')
influencer = pd.read_excel(xls, 'Influencer')
leaders_post = pd.read_excel(xls, 'Leaders-Post')
leaders_story = pd.read_excel(xls, 'Leaders-Story')
post = pd.read_excel(xls, 'Post')
story = pd.read_excel(xls, 'Story')
print('Datasets Loaded Completely.')

Datasets Loaded Completely.


In [3]:
#dummying dataset

# advertising posts
dummy_field = pd.get_dummies(ad_post['field'], prefix='field')
ad_post_dummy = pd.concat([ad_post, dummy_field], axis=1)
ad_post_dummy.drop(['field'], axis=1, inplace=True)

# advertising stories
dummy_field = pd.get_dummies(ad_story['field'], prefix='field')
ad_story_dummy = pd.concat([ad_story, dummy_field], axis=1)
ad_story_dummy.drop(['field'], axis=1, inplace=True)

#influencer
dummy_gender = pd.get_dummies(influencer['gender'], prefix='gender')
dummy_field = pd.get_dummies(influencer['field'], prefix='field')
influencer_dummy = pd.concat([influencer, dummy_gender, dummy_field], axis=1)
influencer_dummy.drop(['gender', 'field'], axis=1, inplace=True)

#leaders posts
dummy_gender = pd.get_dummies(leaders_post['gender'], prefix='gender')
leaders_post_dummy = pd.concat([leaders_post, dummy_gender], axis=1)
leaders_post_dummy.drop(['gender'], axis=1, inplace=True)

In [4]:
# label encoding dataset

# advertising posts
labels, _ = pd.factorize(ad_post['field'])
ad_post_labelencoded = ad_post
ad_post_labelencoded['field_labelencoded'] = labels.tolist()

# advertising stories
labels, _ = pd.factorize(ad_story['field'])
ad_story_labelencoded = ad_story
ad_story_labelencoded['field_labelencoded'] = labels.tolist()

# influencer
labels, _ = pd.factorize(influencer['gender'])
influencer_labelencoded = influencer
influencer_labelencoded['gender_labelencoded'] = labels.tolist()
labels, _ = pd.factorize(influencer['field'])
influencer_labelencoded['field_labelencoded'] = labels.tolist()

# leaders post
labels, _ = pd.factorize(leaders_post['gender'])
leaders_post_labelencoded = leaders_post
leaders_post_labelencoded['gender_labelencoded'] = labels.tolist()

In [5]:
ad_post_y = np.asarray(ad_post_dummy[['cost']])
ad_post_x = np.asarray(ad_post_dummy[['follower', 'view', 'field_art & culture', 'field_fact', 'field_video', 'field_women']])

ad_story_y = np.asarray(ad_story_dummy[['cost']])
ad_story_x = np.asarray(ad_story_dummy[['view', 'follower', 'action', 'interaction', 'impression', 'field_art & culture', 'field_fact', 'field_health',
                                       'field_news', 'field_video', 'field_women']])

influencer_y = np.asarray(influencer_dummy[['cost']])
influencer_x = np.asarray(influencer_dummy[['follower', 'view', 'action', 'impression', 'cta', 'interaction', 'gender_family', 'gender_female', 'gender_male',
                                           'field_cooking', 'field_health', 'field_lifestyle', 'field_sport', 'field_tourism']])

leaders_post_y = np.asarray(leaders_post_dummy[['cost']])
leaders_post_x = np.asarray(leaders_post_dummy[['follower', 'view', 'like', 'comment', 'share', 'save', 'profile_visit', 'reach', 'impression', 'gender_family',
                                               'gender_female', 'gender_male']])

### K-Nearest Neighbor Regressor

#### Advertising Posts

In [7]:
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor

In [8]:
weights = ['uniform', 'distance']

In [38]:
print(len(np.arange(2,10)))
print(np.linspace(1, 2, 8))
# r_test = [0, .1, .8, .3, .9, .5, 1, .7]

8
[1.         1.14285714 1.28571429 1.42857143 1.57142857 1.71428571
 1.85714286 2.        ]


In [44]:
X_test.astype('float64')

array([[1000000.,    9435.,       0.,       0.,       1.,       0.],
       [1700000.,    7926.,       0.,       0.,       1.,       0.],
       [1700000.,   19433.,       0.,       1.,       0.,       0.],
       [ 224000.,    8424.,       1.,       0.,       0.,       0.],
       [1000000.,    8212.,       0.,       1.,       0.,       0.],
       [4500000.,   40630.,       0.,       1.,       0.,       0.],
       [1900000.,   16779.,       0.,       1.,       0.,       0.],
       [ 294000.,    9278.,       1.,       0.,       0.,       0.],
       [ 275000.,   14550.,       1.,       0.,       0.,       0.],
       [1200000.,   15631.,       0.,       0.,       1.,       0.],
       [1500000.,   29296.,       0.,       1.,       0.,       0.],
       [1100000.,   14346.,       0.,       1.,       0.,       0.],
       [ 194000.,   12609.,       1.,       0.,       0.,       0.],
       [ 874000.,   18645.,       1.,       0.,       0.,       0.]])

In [49]:
temp_lst = []
neighbors = np.arange(2, 10)
for i in tqdm(range(2, 9)):
    kf = KFold(n_splits = i)
    for train_index, test_index in kf.split(ad_post_x):
        X_train, X_test = ad_post_x[train_index], ad_post_x[test_index]
        y_train, y_test = ad_post_y[train_index], ad_post_y[test_index]
        for n, r in zip(np.arange(2, 9), np.linspace(1, 2, 8)):
            for w in weights:
                knr = KNeighborsRegressor(n_neighbors = n, weights = w)
                rnr = RadiusNeighborsRegressor(radius = r, weights = w)
                knr.fit(X_train, y_train)
                rnr.fit(X_train, y_train)
                temp_lst2 = []
                temp_lst2.append(i)
                temp_lst2.append(n)
                temp_lst2.append(r)
                temp_lst2.append(w)
                temp_lst2.append(knr.score(X_train, y_train))
                temp_lst2.append(rnr.score(X_train, y_train))
                temp_lst2.append(knr.score(X_test, y_test))
#                 temp_lst2.append(0)
                temp_lst2.append(rnr.score(X_test, y_test))
                temp_lst.append(temp_lst2)

temp_df = pd.DataFrame(temp_lst, columns=['k', '# of Neighbors (Only KNR)', '# of Radius (Only RNR)', 'Weights', 'KNR Train Score', 'RNR Train Score', 
                                          'KNR Test Score', 'RNR Test Score'])

temp_lst = []
for k in range(2, 9):
    for n_, r_ in zip(np.arange(2, 9), np.linspace(1, 2, 8)):
        for w_ in weights:
            temp_lst2 = []
            temp_lst2.append(k)
            temp_lst2.append(n_)
            temp_lst2.append(r_)
            temp_lst2.append(w_)
            temp_lst2.append(np.round(np.mean(temp_df[(temp_df['k'] == k) & (temp_df['# of Neighbors (Only KNR)'] == n_) & 
                                                      (temp_df['# of Radius (Only RNR)'] == r_) & (temp_df['Weights'] == w_)]['KNR Train Score']), decimals=4))
            temp_lst2.append(np.round(np.mean(temp_df[(temp_df['k'] == k) & (temp_df['# of Neighbors (Only KNR)'] == n_) & 
                                                      (temp_df['# of Radius (Only RNR)'] == r_) & (temp_df['Weights'] == w_)]['RNR Train Score']), decimals=4))
            temp_lst2.append(np.round(np.mean(temp_df[(temp_df['k'] == k) & (temp_df['# of Neighbors (Only KNR)'] == n_) & 
                                                      (temp_df['# of Radius (Only RNR)'] == r_) & (temp_df['Weights'] == w_)]['KNR Test Score']), decimals=4))
            temp_lst2.append(np.round(np.mean(temp_df[(temp_df['k'] == k) & (temp_df['# of Neighbors (Only KNR)'] == n_) & 
                                                      (temp_df['# of Radius (Only RNR)'] == r_) & (temp_df['Weights'] == w_)]['RNR Test Score']), decimals=4))
            temp_lst.append(temp_lst2)

nn_reg_eval_df = pd.DataFrame(temp_lst, columns=['k', '# of Neighbors (Only KNR)', '# of Radius (Only RNR)', 'Weights', 'KNR Train Score', 'RNR Train Score', 
                                                 'KNR Test Score', 'RNR Test Score'])
nn_reg_eval_df

  0%|          | 0/7 [00:00<?, ?it/s]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [48]:
nn_reg_eval_df.nlargest(3, 'KNR Test Score')

Unnamed: 0,k,# of Neighbors (Only KNR),# of Radius (Only RNR),Weights,KNR Train Score,RNR Train Score,KNR Test Score,RNR Test Score
64,6,2,1.0,uniform,0.88,1.0,0.44,0.0
65,6,2,1.0,distance,1.0,1.0,0.43,0.0
17,3,2,1.0,distance,1.0,1.0,0.37,0.0
