### Import Packages and Reading of Data

In [16]:
import time
import warnings
import pickle

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, RepeatedKFold

pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

In [17]:
# Reading of Excel
dataset = pd.read_csv('cleaned_others_dataset.csv')

In [18]:
dataset

Unnamed: 0,verified,location,followers_count,following_count,tweet_count,isFraud,un_no_of_char,un_special_char,un_uppercase,name_no_of_char,name_special_char,name_uppercase,des_no_of_usertags,des_no_of_hashtags,des_external_links,has_description,year,Apr,Aug,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep,Fri,Mon,Sat,Sun,Thu,Tue,Wed
0,0,0,1997,5,17090,1,15,1,0,19,0,1,2,0,0,1,2016,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
1,0,0,1997,5,17090,1,15,1,0,19,0,1,2,0,0,1,2016,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
2,0,0,1997,5,17090,1,15,1,0,19,0,1,2,0,0,1,2016,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,0,0,1997,5,17090,1,15,1,0,19,0,1,2,0,0,1,2016,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0,0,1997,5,17090,1,15,1,0,19,0,1,2,0,0,1,2016,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88127,1,1,803247,7,3159,0,9,1,1,10,1,0,0,0,0,0,2012,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
88128,1,1,803247,7,3159,0,9,1,1,10,1,0,0,0,0,0,2012,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
88129,1,1,803247,7,3159,0,9,1,1,10,1,0,0,0,0,0,2012,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
88130,1,1,803247,7,3159,0,9,1,1,10,1,0,0,0,0,0,2012,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


### Feature Engineering

In [19]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=["isFraud"]), dataset["isFraud"], test_size = 0.2, random_state=101)
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)

Train set: (70505, 35) (70505,)
Test set: (17627, 35) (17627,)


In [20]:
scaler = StandardScaler()
fitted = scaler.fit(X_train)
X_train_fitted = fitted.transform(X_train)
X_test_fitted = fitted.transform(X_test)

In [21]:
x_train = pd.DataFrame(X_train_fitted, columns = X_train.columns)
x_test = pd.DataFrame(X_test_fitted, columns = X_test.columns)
y_train = pd.DataFrame(y_train, columns = ["isFraud"])
y_test = pd.DataFrame(y_test, columns = ["isFraud"])

### Charts

In [22]:
def feature_chart(feature_importance_df, model_type):
#     feature_importance_df = pd.DataFrame(model.coef_.T, x_train.columns.T,columns=['Feature Importance']).sort_values(by='Feature Importance', ascending=False)
    fig = feature_importance_df.plot(kind="barh", figsize=(20, 50))
    bg = fig.patch
    plt.gca().invert_yaxis()
    plt.grid()
#     plt.savefig(f'Charts\\{model_type} Feature Importance.png', dpi=300)
    plt.show()

### Support Vector Machine
* Model (Random Search + Grid Search)
* Error Metrics
* Plot for Feature Importance

#### Model: Random Search

In [23]:
%%time

# Model
svm_model = SVC()

space = dict()

# Kernel type to be used in the algorithm
space["kernel"] = ['poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function
space["degree"] = [1, 3, 8]

# Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’
space["gamma"] = ['scale', 'auto']

# Regularisation parameter
space["C"] = [0.01, 0.1, 1, 10]

# Enable verbose output
space["verbose"] = [True, False]

# Define search
search = RandomizedSearchCV(svm_model, space, scoring='neg_mean_absolute_error', cv = 5, verbose = 2, random_state = 123, n_jobs = -1)

# Execute search
results_random = search.fit(x_train, np.ravel(y_train))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Wall time: 1h 40min 51s


In [24]:
# Save the model to disk
filename = '04 SVM (Random).sav'
pickle.dump(results_random, open(filename, 'wb'))

# Load the model from disk
# results_random = pickle.load(open(filename, 'rb'))

print('Best Score: %s' % results_random.best_score_)
print('Best Hyperparameters: %s' % results_random.best_params_)

Best Score: -0.12182114743635204
Best Hyperparameters: {'verbose': False, 'kernel': 'rbf', 'gamma': 'auto', 'degree': 1, 'C': 10}


In [25]:
# y_prediction
y_pred = results_random.predict(x_test)

In [33]:
# Error Metrics
mae = metrics.mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')
f1_score = metrics.f1_score(y_test, y_pred)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {recall}')

MAE: 0.11771713848073978
RMSE: 0.3430993128537855
F1-score: 0.6836407989022717
Precision: 0.8511769172361427
Recall: 0.5712101910828026


#### Model: Grid Search

In [29]:
%%time

# Model
svm_model = SVC()

grid = dict()

# Kernel type to be used in the algorithm
grid["kernel"] = ['poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function
grid["degree"] = [1, 3, 8]

# Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’
grid["gamma"] = ['scale', 'auto']

# Regularisation parameter
grid["C"] = [0.01, 0.1, 1, 10]

# Enable verbose output
grid["verbose"] = [True, False]

# Define search
search = GridSearchCV(svm_model, grid, cv = 5, n_jobs = -1, verbose = 2)

# Execute search
results_grid = search.fit(x_train, np.ravel(y_train))

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[LibSVM]Wall time: 21h 20min 21s


In [30]:
# Save the model to disk
filename = '04 SVM (Grid).sav'
pickle.dump(results_grid, open(filename, 'wb'))

# Load the model from disk
# results_grid = pickle.load(open(filename, 'rb'))

print('Best Score: %s' % results_grid.best_score_)
print('Best Hyperparameters: %s' % results_grid.best_params_)

Best Score: 0.9191546698815687
Best Hyperparameters: {'C': 10, 'degree': 8, 'gamma': 'scale', 'kernel': 'poly', 'verbose': True}


In [31]:
# y_prediction
y_pred2 = results_grid.predict(x_test)

In [34]:
# Error Metrics
mae = metrics.mean_absolute_error(y_test, y_pred2)
print(f'MAE: {mae}')
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred2))
print(f'RMSE: {rmse}')
f1_score = metrics.f1_score(y_test, y_pred2)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {recall}')

MAE: 0.07443127021047256
RMSE: 0.27282094899489034
F1-score: 0.8118187033849684
Precision: 0.8511769172361427
Recall: 0.5712101910828026


#### Model: Optimal Hyperparameters

In [35]:
# Model
# Best Hyperparameters (Random): {'verbose': False, 'kernel': 'rbf', 'gamma': 'auto', 'degree': 1, 'C': 10}
# Best Hyperparameters (Grid): {'C': 10, 'degree': 8, 'gamma': 'scale', 'kernel': 'poly', 'verbose': True}
# random_state = 123
svm_regression = SVC(C = 10, degree = 8, gamma = 'scale', kernel = 'poly', verbose = True, random_state = 123)
svm_model = svm_regression.fit(x_train, np.ravel(y_train))

# Save the model to disk
filename = '04 SVM (Optimal).sav'
pickle.dump(svm_model, open(filename, 'wb'))

# Load the model from disk
# rf_regression = pickle.load(open(filename, 'rb'))

[LibSVM]

In [37]:
# y_prediction
y_pred3 = svm_model.predict(x_test)

In [38]:
# Error Metrics
mae = metrics.mean_absolute_error(y_test, y_pred3)
print(f'MAE: {mae}')
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred3))
print(f'RMSE: {rmse}')
f1_score = metrics.f1_score(y_test, y_pred3)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {recall}')

MAE: 0.07443127021047256
RMSE: 0.27282094899489034
F1-score: 0.8118187033849684
Precision: 0.8511769172361427
Recall: 0.5712101910828026


#### Feature Importance

In [40]:
# rf_model.feature_importances_
feature = pd.DataFrame(svm_model.coef_, x_train.columns.T, columns=['Feature Importance']).sort_values(by='Feature Importance', ascending=False)
feature

AttributeError: coef_ is only available when using a linear kernel

In [None]:
feature_chart(feature, "Random Forest")