In [1]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import accuracy_score

In [2]:
# Read the training and test csv files and convert them to a dataframe each
data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

In [3]:
# Cleaning training data
# Replace special character * from columns traffictype, browser, operatingsystems
data['traffictype'] = data['traffictype'].replace("*", 2)
data['traffictype'] = pd.to_numeric(data['traffictype'])
data['browser'] = data['browser'].replace("*", 2)
data['browser'] = pd.to_numeric(data['browser'])
data['operatingsystems'] = data['operatingsystems'].replace("*", 2)
data['operatingsystems'] = pd.to_numeric(data['operatingsystems'])

# Convert the column - weekend to integer from boolean value
data['weekend'] = data['weekend'].astype(int)

# Convert the categorical column month to numeric representation of months.
d = dict((v,k) for k,v in zip(range(1, 13), data.month.unique()))
# create column
data['month'] = data['month'].map(d)

In [4]:
# The feature columns are derived in X and the target column revenue is derived in y.
X = data.drop(columns=['revenue']).copy()
y = data['revenue'].copy()
encoded_data = pd.get_dummies(X)

In [5]:
# outliers detection and capping:

print("Highest allowed",encoded_data['exitrates'].mean() + 3*encoded_data['exitrates'].std())
print("Lowest allowed",encoded_data['exitrates'].mean() - 3*encoded_data['exitrates'].std())

upper_limit= encoded_data['exitrates'].mean() + 3*encoded_data['exitrates'].std()
lower_limit=encoded_data['exitrates'].mean() - 3*encoded_data['exitrates'].std()

encoded_data['exitrates'] = np.where(
    encoded_data['exitrates']>.10,
    .10,
    np.where(
        encoded_data['exitrates']<lower_limit,
        lower_limit,
        encoded_data['exitrates']
    )
)

print(encoded_data['exitrates'].describe())





print("Highest allowed",encoded_data['bouncerates'].mean() + 3*encoded_data['bouncerates'].std())
print("Lowest allowed",encoded_data['bouncerates'].mean() - 3*encoded_data['bouncerates'].std())

upper_limit= encoded_data['bouncerates'].mean() + 3*encoded_data['bouncerates'].std()
lower_limit=encoded_data['bouncerates'].mean() - 3*encoded_data['bouncerates'].std()

encoded_data['bouncerates'] = np.where(
    encoded_data['bouncerates']>.04,
    .04,
    np.where(
        encoded_data['bouncerates']<lower_limit,
        lower_limit,
        encoded_data['bouncerates']
    )
)

print(encoded_data['bouncerates'].describe())





print("Highest allowed",encoded_data['pagevalues'].mean() + 3*encoded_data['pagevalues'].std())
print("Lowest allowed",encoded_data['pagevalues'].mean() - 3*encoded_data['pagevalues'].std())

upper_limit= encoded_data['pagevalues'].mean() + 3*encoded_data['pagevalues'].std()
lower_limit=encoded_data['pagevalues'].mean() - 3*encoded_data['pagevalues'].std()

encoded_data['pagevalues'] = np.where(
    encoded_data['pagevalues']>10,
    10,
    np.where(
        encoded_data['pagevalues']<lower_limit,
        lower_limit,
        encoded_data['pagevalues']
    )
)

print(encoded_data['pagevalues'].describe())


encoded_data['pagevalues'] = np.where(
    encoded_data['pagevalues']>10,
    10,
    np.where(
        encoded_data['pagevalues']<lower_limit,
        lower_limit,
        encoded_data['pagevalues']
    )
)

Highest allowed 0.18820489476478885
Lowest allowed -0.10359015769414104
count    40000.000000
mean         0.035407
std          0.029742
min          0.000000
25%          0.013389
50%          0.025000
75%          0.050000
max          0.100000
Name: exitrates, dtype: float64
Highest allowed 0.16756717675784605
Lowest allowed -0.12362973410874561
count    40000.000000
mean         0.010520
std          0.014172
min          0.000000
25%          0.000000
50%          0.003304
75%          0.016667
max          0.040000
Name: bouncerates, dtype: float64
Highest allowed 59.6269001904745
Lowest allowed -46.63697372849511
count    40000.000000
mean         2.114995
std          3.915794
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         10.000000
Name: pagevalues, dtype: float64


#### Model  - SVC

In [6]:
# Using Oversampling
oversample = SMOTE(random_state = 0)
X_os, y_os = oversample.fit_resample(encoded_data, y)

In [7]:
svc_over_sample=SVC()

svc_over_sample.fit(X_os, y_os)

SVC()

In [None]:
# define the undersampling method
undersample = TomekLinks()

# transform the dataset
X_us, y_us = undersample.fit_resample(encoded_data, y)

In [None]:
svc_under_sample=SVC()
svc_under_sample.fit(X_us, y_us)

In [8]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

cv = KFold(n_splits=4, random_state=1, shuffle=True)
# create model
# evaluate model
scores = cross_val_score(svc_over_sample, X_os, y_os, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Accuracy: 0.532 (0.002)


In [9]:
scores = cross_val_score(svc_over_sample, X_os, y_os, scoring='f1', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Accuracy: 0.190 (0.005)


In [12]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

cv = KFold(n_splits=4, random_state=1, shuffle=True)
# create model
# evaluate model
scores = cross_val_score(svc_under_sample, X_us, y_us, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Accuracy: 0.830 (0.003)


In [None]:
scores = cross_val_score(svc_under_sample, X_us, y_us, scoring='f1', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

In [10]:
data_test['traffictype'] = data_test['traffictype'].replace("*", 4)
data_test['traffictype'] = pd.to_numeric(data_test['traffictype'])
data_test['browser'] = data_test['browser'].replace("*", 2)
data_test['browser'] = pd.to_numeric(data_test['browser'])
data_test['operatingsystems'] = data_test['operatingsystems'].replace("*", 2)
data_test['operatingsystems'] = pd.to_numeric(data_test['operatingsystems'])
data_test['weekend'] = data_test['weekend'].astype(int)
data_test.month.value_counts()
d = dict((v,k) for k,v in zip(range(1, 13), data_test.month.unique()))
# create column
data_test['month'] = data_test['month'].map(d)

In [11]:
X1 = pd.get_dummies(data_test)

In [12]:
y_pred_test = svc_over_sample.predict(X1)
y_pred_test = pd.Series(y_pred_test)
y_pred_test.value_counts(normalize=True)

0    0.9364
1    0.0636
dtype: float64

In [33]:
y_pred_test = svc_under_sample.predict(X1)
y_pred_test = pd.Series(y_pred_test)
y_pred_test.value_counts(normalize=True)

0    1.0
dtype: float64

#### LDA

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA()
X_lda = lda.fit_transform(encoded_data,y)

In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(lda, encoded_data, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean Accuracy: 0.879 (0.002)


In [18]:
scores = cross_val_score(lda, encoded_data, y, scoring='f1', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean Accuracy: 0.660 (0.006)


In [59]:
y_pred_test = lda.predict(X1)
y_pred_test = pd.Series(y_pred_test)
y_pred_test.value_counts(normalize=True)

0    0.8058
1    0.1942
dtype: float64

In [1]:
scores = cross_val_score(lda, encoded_data, y, scoring='f1', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

NameError: name 'cross_val_score' is not defined