In [1]:
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import load_baseline_train_data

In [2]:
# Read the training tweets through the project's loader module.
# Later cells call .apply/.drop/.reset_index on the result, so it is presumably
# a pandas DataFrame with at least 'tweet' and 'label' columns -- TODO confirm.
TRAIN_DATA_PATH = 'training_user_tweet.csv'
df = load_baseline_train_data.get_user_tweet_data(TRAIN_DATA_PATH)

In [3]:
# Lazily-built cache of the NLTK English stop-word set, shared by every call so
# the corpus file is read once instead of once per tweet.
_ENGLISH_STOPWORDS = None


def clean_tweet(x, stop_words=None):
    """Normalise one raw tweet into a space-separated string of lowercase words.

    Steps, in order: strip links (anything starting with ``http``), strip
    backslash-escape residue (a literal backslash, one or two lowercase
    letters, then digits), drop every character that is not an ASCII letter
    or a space, discard words of two characters or fewer, lowercase the rest
    and remove stop words.

    Parameters
    ----------
    x : str
        Raw tweet text. Any non-string value (e.g. ``None`` or a NaN cell)
        is treated as an empty tweet.
    stop_words : collection of str, optional
        Lowercase words to remove. Defaults to NLTK's English stop-word list,
        which is loaded on first use and cached.

    Returns
    -------
    str
        The cleaned tweet; ``''`` when nothing survives the filters.
    """
    global _ENGLISH_STOPWORDS

    # Missing values reach us as None/float NaN from pandas; re.sub would raise.
    if not isinstance(x, str):
        return ''

    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            # frozenset gives O(1) membership tests instead of scanning a list.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS

    no_links = re.sub(r'http\S+', '', x)
    no_unicode = re.sub(r"\\[a-z][a-z]?[0-9]+", '', no_links)
    letters_only = re.sub('[^A-Za-z ]+', '', no_unicode)

    # Length is checked before lowercasing/stop-word removal, as before;
    # the empty strings produced by consecutive spaces are dropped here too.
    words = (w.lower() for w in letters_only.split(" ") if len(w) > 2)
    return ' '.join(w for w in words if w not in stop_words)




# Run clean_tweet over every row and overwrite the 'tweet' column with the
# result, bracketing the pass with progress messages.
print('Cleaning tweet')
cleaned_tweets = df['tweet'].apply(clean_tweet)
df['tweet'] = cleaned_tweets
print('Tweet Cleaned')


Cleaning tweet
Tweet Cleaned


In [4]:
# TF-IDF features for the cleaned tweets.
#   min_df=5          -> ignore terms that occur in fewer than 5 tweets
#   max_df=0.95       -> ignore terms that occur in more than 95% of tweets
#   max_features=2000 -> cap the vocabulary at 2000 terms
#   stop_words        -> scikit-learn's built-in English list (applied in
#                        addition to the stop-word removal done while cleaning)
tfidf = TfidfVectorizer(
    min_df=5,
    max_df=0.95,
    max_features=2000,
    stop_words='english',
)
print('Fitting TF-IDF :')
# fit_transform learns the vocabulary and builds the document-term matrix in a
# single pass instead of tokenising the whole corpus twice (fit, then transform).
# NOTE(review): the vectoriser is fitted on every row, before the train/test
# split made in a later cell, so vocabulary and IDF weights also see the rows
# that end up in the test set -- consider fitting on the training rows only.
text = tfidf.fit_transform(df.tweet)
print('TF-IDF fitting complete')
print(text.shape)
print(df.shape)

Fitting TF-IDF :
TF-IDF fitting complete
(12143, 2000)
(12143, 28)


In [5]:
# Assemble the model inputs: every non-text column of df side by side with the
# TF-IDF matrix, and the 'label' column as the target.
df = df.reset_index(drop=True)  # 0..n-1 index so the column-wise concat aligns row for row

Y = df['label']
meta_features = df.drop(columns=['tweet']).drop(columns=['label'])
tfidf_features = pd.DataFrame(text.todense())  # sparse matrix -> dense frame

# ignore_index=True on axis=1 discards the column names and relabels the
# combined columns 0..k-1.
X = pd.concat([meta_features, tfidf_features], axis=1, ignore_index=True)

print(X.shape)
print(Y.shape)


(12143, 2026)
(12143,)


In [12]:
# Single hold-out evaluation: 80/20 split stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

# RBF-kernel SVC with every hyper-parameter spelled out explicitly.
# verbose=True makes LibSVM write its progress marker to the cell output.
svc_settings = dict(
    C=100,
    kernel='rbf',
    gamma='auto',
    degree=3,
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None,
    verbose=True,
)
svclassifier_1fold = SVC(**svc_settings)
svclassifier_1fold.fit(X_train, y_train)

# Score the held-out rows: overall accuracy, then the per-class report.
predictions = svclassifier_1fold.predict(X_test)
score = accuracy_score(y_test, predictions)
print("Classification Accuracy with Standard 1 fold Split:")
print(score)
print(classification_report(y_test, predictions))

[LibSVM]Classification Accuracy with Standard 1 fold Split:
0.8970769864141622
              precision    recall  f1-score   support

           0       0.71      0.98      0.83       606
           1       0.99      0.87      0.93      1823

    accuracy                           0.90      2429
   macro avg       0.85      0.93      0.88      2429
weighted avg       0.92      0.90      0.90      2429



In [10]:

# 10-fold cross-validation of the same SVC configuration on the full feature
# matrix, recording accuracy and F1 for both the train and the test folds.
N_FOLDS = 10

scoring = {'acc': 'accuracy',
           'f1': 'f1'}
svclassifier = SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
                   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
                   max_iter=-1, probability=False, random_state=None, shrinking=True,
                   tol=0.001, verbose=True)

# NOTE(review): an integer `cv` with a classifier means stratified folds taken
# in row order, without shuffling, whereas the hold-out split in this notebook
# is shuffled. If rows belonging to the same user are adjacent in the data
# (TODO confirm), these folds behave like a per-user split and a group-aware
# splitter such as GroupKFold would state that intent explicitly.
scores = cross_validate(svclassifier, X, Y, scoring=scoring,
                        cv=N_FOLDS, return_train_score=True)

print(scores.keys())
print(scores['test_acc'])

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]dict_keys(['fit_time', 'score_time', 'test_acc', 'train_acc', 'test_f1', 'train_f1'])
[0.25020576 0.92839506 0.99176955 0.99506173 0.89785832 0.55189456
 0.83607908 0.98187809 0.99341021 0.55070074]


In [11]:
# Summarise the cross-validation run. The fold count is read from the results
# themselves so the printed labels always match the number of folds that ran.
n_folds = len(scores['test_acc'])
print('Average Test accuracy across {} folds: '.format(n_folds))
print(sum(scores['test_acc'])/n_folds)
print('Average F1 score(class 1) across {} folds: '.format(len(scores['test_f1'])))
print(sum(scores['test_f1'])/len(scores['test_f1']))



Average Test accuracy across 5 folds: 
0.7977253104318942
Average F1 score(class 1) across 5 folds: 
0.7896353375147311
