In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
import re

In [2]:
df = pd.read_csv("train_E6oV3lV.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
hate_tweet = df[df.label == 1]
hate_tweet.head()

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'buil...
14,15,1,no comment! in #australia #opkillingbay #se...
17,18,1,retweet if you agree!
23,24,1,@user @user lumpy says i am a . prove it lumpy.
34,35,1,it's unbelievable that in the 21st century we'...


In [4]:
df_Stat=df[['label','tweet']].groupby('label').count().reset_index()
df_Stat.columns=['label','count']
df_Stat['percentage']=(df_Stat['count']/df_Stat['count'].sum())*100
df_Stat

Unnamed: 0,label,count,percentage
0,0,29720,92.98542
1,1,2242,7.01458


In [5]:
def process_tweet(tweet):
    return " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ",tweet.lower()).split())

df['processed_tweets'] = df['tweet'].apply(process_tweet)
df.head()

Unnamed: 0,id,label,tweet,processed_tweets
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i can t use cause they ...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [6]:
#As this dataset is highly imbalance we have to balance this by over sampling
cnt_non_fraud = df[df['label'] == 0]['processed_tweets'].count()
df_class_fraud = df[df['label'] == 1]
df_class_nonfraud = df[df['label'] == 0]
df_class_fraud_oversample = df_class_fraud.sample(cnt_non_fraud, replace=True)
df_oversampled = pd.concat([df_class_nonfraud, df_class_fraud_oversample], axis=0)

print('Random over-sampling:')
print(df_oversampled['label'].value_counts())

Random over-sampling:
0    29720
1    29720
Name: label, dtype: int64


In [7]:
# Split data into training and test sets
from sklearn.model_selection import train_test_split
X = df_oversampled['processed_tweets']
y = df_oversampled['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = None)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

In [9]:
x_train_counts = count_vect.fit_transform(X_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

<47552x34178 sparse matrix of type '<class 'numpy.int64'>'
	with 345322 stored elements in Compressed Sparse Row format>

In [10]:
print(x_train_counts.shape)
print(x_train_tfidf.shape)

(47552, 34178)
(47552, 34178)


In [11]:
x_test_counts = count_vect.transform(X_test)
x_test_tfidf = transformer.transform(x_test_counts)

In [12]:
from sklearn import svm
lin_clf = svm.LinearSVC()
lin_clf.fit(x_train_tfidf,y_train)

In [13]:
predict_svm = lin_clf.predict(x_test_tfidf)

predict_svm

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [14]:
y_test

7909     0
30167    0
21013    0
18416    1
19376    0
        ..
28527    1
14449    1
8183     1
10733    0
22031    1
Name: label, Length: 11888, dtype: int64

In [15]:
from sklearn.metrics import confusion_matrix,f1_score
print(confusion_matrix(y_test,predict_svm))
print(classification_report(y_test, predict_svm))

[[5731  159]
 [  21 5977]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      5890
           1       0.97      1.00      0.99      5998

    accuracy                           0.98     11888
   macro avg       0.99      0.98      0.98     11888
weighted avg       0.99      0.98      0.98     11888



In [38]:
test = [process_tweet("niceee!")]
testing_counts = count_vect.transform(test)
testing_tfidf = transformer.transform(testing_counts)

lin_clf.predict(testing_tfidf)

array([0], dtype=int64)

In [39]:
import pickle

model_file = open("model", "wb")
count_vect_file = open("count_vect", "wb")
transformer_file = open("transformer", "wb")

pickle.dump(lin_clf, model_file)
pickle.dump(count_vect, count_vect_file)
pickle.dump(transformer, transformer_file)

model_file.close()
count_vect_file.close()
transformer_file.close()