In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Seed NumPy's global RNG so the random sampling further down repeats on a fresh run.
np.random.seed(2)

# Read the raw table and preview its first rows.
DATA_PATH = '/kaggle/input/online-shoppers-intention/online_shoppers_intention.csv'
data = pd.read_csv(DATA_PATH)
data.head()

* "Administrative", "Administrative Duration", "Informational", "Informational Duration", "Product Related" and "Product Related Duration" represent the number of different types of pages visited by the visitor in that session and total time spent in each of these page categories. 
* The value of "Bounce Rate" feature for a web page refers to the percentage of visitors who enter the site from that page and then leave ("bounce") without triggering any other requests to the analytics server during that session. 
* The value of "Exit Rate" feature for a specific web page is the percentage of all pageviews of that page that were the last in the session. 
* The "Page Value" feature represents the average value for a web page that a user visited before completing an e-commerce transaction.
* The "Special Day" feature indicates the closeness of the site visiting time to a specific special day (e.g. Mother’s Day, Valentine's Day) in which the sessions are more likely to be finalized with transaction. 
*  The dataset also includes operating system, browser, region, traffic type, visitor type as returning or new visitor, a Boolean value indicating whether the date of the visit is weekend, and month of the year.



In [None]:
# (rows, columns) of the table as loaded from the CSV.
data.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count sessions per month, split by whether they ended in revenue.
# The variables are passed by keyword: recent seaborn releases no longer accept
# the x vector as the first positional argument (it is interpreted as `data`).
sns.countplot(x='Month', hue='Revenue', data=data)
plt.title('Revenue Rate for A Year', size = 15)

# Replace the raw hue labels with readable ones; entry 0 is the first hue
# level and entry 1 the second.
legend = plt.legend(loc = 'best')
legend.get_texts()[0].set_text('No Revenue')
legend.get_texts()[1].set_text('Revenue')
plt.show()

In [None]:
# Correlate only numeric/boolean columns. At this point `data` still holds its
# non-numeric columns (presumably 'Month' and 'VisitorType'), and calling
# DataFrame.corr() on those raises on pandas >= 2.0; older pandas silently
# skipped them, so this selection gives the same matrix there.
cor = data.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(20,10))
sns.heatmap(cor, vmin=-0.5, cmap="YlGnBu")

**Data Preparation for Modelling**

In [None]:
# Count missing cells per column, then total them over the whole table.
missing_per_column = data.isna().sum()
missing_per_column.sum()

In [None]:
# Drop the features that are not used for modelling.
UNUSED_FEATURES = [
    'BounceRates', 'ExitRates', 'SpecialDay', 'OperatingSystems', 'Browser',
    'VisitorType', 'Month', 'Region', 'TrafficType', 'Weekend',
]

# `data` is rebound to the reduced frame, so running this cell a second time
# would otherwise raise KeyError for the already-removed columns;
# errors='ignore' keeps the cell safe to re-run.
data = data.drop(columns=UNUSED_FEATURES, errors='ignore')

In [None]:
## Correlation Matrix
sns.set(style="white", font_scale=2)

# Compute the correlation matrix
corr = data.corr()

# Generate a mask for the upper triangle (diagonal included) so each pair of
# features is drawn only once. The builtin `bool` is used because the
# `np.bool` alias was removed in NumPy 1.24 and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(18, 15))
f.suptitle("Correlation Matrix", fontsize = 40)

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

Since none of the remaining features are strongly correlated with one another, we can't drop any of them as redundant.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct target values, then overwrite the 'Revenue' column with
# their integer codes.
le = LabelEncoder()
le.fit(data['Revenue'])
data['Revenue'] = le.transform(data['Revenue'])

In [None]:
# Index labels of the rows whose encoded 'Revenue' is 1, and how many there are.
# (The "fraud" naming is kept because later cells refer to these variables.)
is_positive = (data['Revenue'] == 1).to_numpy()
fraud_indices = np.array(data.index[is_positive])
number_records_fraud = fraud_indices.size
print(number_records_fraud)

In [None]:
# Index labels of the rows whose encoded 'Revenue' is 0.
is_negative = (data['Revenue'] == 0).to_numpy()
normal_indices = data.index[is_negative]

In [None]:
# Draw, without replacement, as many Revenue == 0 labels as there are
# Revenue == 1 rows, so that both classes end up the same size.
sampled_labels = np.random.choice(
    normal_indices,
    size=number_records_fraud,
    replace=False,
)
random_normal_indices = np.array(sampled_labels)
print(random_normal_indices.shape[0])

In [None]:
# Join the Revenue == 1 labels and the sampled Revenue == 0 labels into one array.
under_sample_indices = np.concatenate((fraud_indices, random_normal_indices))
print(under_sample_indices.size)

In [None]:
# `under_sample_indices` holds index *labels* (they were read from `.index`),
# so rows must be selected by label with .loc. The positional .iloc only gave
# the same rows because read_csv produced a default 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

In [None]:
# Features: every column except the target.
X_undersample = under_sample_data.drop(columns='Revenue')

# Target as a 1-D Series. scikit-learn estimators expect y of shape
# (n_samples,); a single-column DataFrame triggers DataConversionWarning
# ("A column-vector y was passed when a 1d array was expected") in fit().
y_undersample = under_sample_data['Revenue']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced sample for testing.
# random_state pins the split explicitly instead of depending on the current
# state of NumPy's global RNG (which changes if earlier cells are re-run);
# stratify keeps the class balance created by the undersampling in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_undersample,
    y_undersample,
    test_size=0.2,
    random_state=2,
    stratify=y_undersample,
)

In [None]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Preview only the first few scaled training rows; as the cell's last
# expression it is displayed without print().
X_train[:5]

In [None]:
# Preview only the first few training targets; as the cell's last expression
# it gets the notebook's rich display instead of print()'s plain text.
y_train.head()

In [None]:
# (rows, features) of the scaled training split.
X_train.shape

In [None]:
# (rows, features) of the scaled test split.
X_test.shape

**Modelling** : Machine Learning 

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with its default hyper-parameters, fitted on the
# training split. `classifier` is rebound by each model cell below.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Predict the held-out rows with the current `classifier` and report the
# fraction predicted correctly.
y_pred = classifier.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters, fitted on the training split.
# random_state is fixed (same value the SVC cell uses) so the forest does not
# change with the current state of NumPy's global RNG when the cell is re-run.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the current `classifier` on the held-out split.
y_pred = classifier.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with default hyper-parameters, fitted on the training
# split. random_state is fixed (same value the SVC cell uses) so the fitted
# tree is the same on every run of this cell.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Test-split predictions of the current `classifier`, then their accuracy.
y_pred = classifier.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, fitted on the training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Accuracy of the current `classifier` on the held-out split.
y_pred = classifier.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel and a fixed random_state,
# fitted on the training split.
classifier = SVC(
    kernel='rbf',
    random_state=0,
)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict the held-out rows, print the confusion matrix of true vs. predicted
# labels, then display the accuracy.
y_pred = classifier.predict(X=X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier: 5 neighbours, Minkowski metric with p=2
# (i.e. Euclidean distance), fitted on the training split.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Held-out predictions of the current `classifier` and their accuracy.
y_pred = classifier.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the training split.
classifier = LogisticRegression()
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Final model check: predict the held-out rows and display the accuracy.
y_pred = classifier.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)