In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [0]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [0]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

In [0]:
!pip3 install tweet-preprocessor
!pip3 install wordninja

# Loading train data

In [0]:
data = pd.read_csv('/content/drive/My Drive/SMAI_Final_Assignment/Q1/final_train.csv')
X = data['text']
y = data['labels']
X = X.to_numpy()
y = y.to_numpy()
# print(X)
# print(y)

# The sections below show different methods for preprocessing of data

## In this section we are removing the special characters and extra spaces from the data.

In [0]:
import re
no_space = re.compile("(&(\w*))|(@(\w*))|(\;)|(\')|(#)|(\.)|(\;)|(\:)|(\!)|(\*)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
space = re.compile("(\-)|(\/)")
single_digits = re.compile(r"\b[A-Za-z]\b")
digits = re.compile("\d+")
extra_spaces = re.compile(r'\s+')
backslash = re.compile(r'\\')

def preprocess_reviews(tweet):
    
    tweet = [no_space.sub("", line.lower()) for line in tweet]
    tweet = [space.sub(" ", line) for line in tweet]
    tweet = [single_digits.sub(" ", line) for line in tweet]
    tweet = [digits.sub(" ", line) for line in tweet]
    tweet = [backslash.sub(" ", line) for line in tweet]
    tweet = [extra_spaces.sub(" ", line) for line in tweet]
    
    return tweet

## Performing lemmatization

In [0]:
from nltk.stem import WordNetLemmatizer
def get_lemmatized_text(corpus):
    lemmatizer = WordNetLemmatizer()
    corpus = [' '.join([lemmatizer.lemmatize(word) for word in line.split()]) for line in corpus]
    return corpus

## In this section we are removing all URLs, emojis, numbers and mention tags from the data.

In [0]:
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER, p.OPT.MENTION)
i = 0
for line in X:
    cleaned_line = p.clean(line)
    X[i] = cleaned_line
    i = i+1


## Using the wordninja module, we are splitting the data into seperate words.

In [0]:
import wordninja
i = 0
for text in X:
    arr = wordninja.split(text)
    arr1 = ""
    for j in arr:
        arr1 = arr1+" "+j
    X[i] = arr1
    i = i+1

In [0]:
X = preprocess_reviews(X)
# X

In [0]:
# X = remove_stop_words(X)
# X

In [0]:
X = get_lemmatized_text(X)

In [0]:
# vectorizer = TfidfVectorizer(stop_words = 'english', ngram_range=(1, 3))
vectorizer = CountVectorizer(stop_words = 'english', ngram_range=(1, 2))
X = vectorizer.fit_transform(X)

In [13]:
X_train = X
y_train = y
X_train

<5266x58165 sparse matrix of type '<class 'numpy.int64'>'
	with 147176 stored elements in Compressed Sparse Row format>

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X , y, train_size = 0.80)

# Logistic Regression model

In [15]:
from sklearn.linear_model import LogisticRegression

c = [0.001, 0.01, 0.05, 0.5, 0.1]
for i in c:
    model_lr = LogisticRegression(C=i)
    model_lr.fit(X_train, y_train)
    y_pred = model_lr.predict(X_test)
    accuracy_lr = accuracy_score(y_test, y_pred)
    f1_lr = f1_score(y_test, y_pred)
    print ("C = ", i, " Accuracy: ", accuracy_lr)
    print ("C = ", i, " F1 score: ", f1_lr)

C =  0.001  Accuracy:  0.627134724857685
C =  0.001  F1 score:  0.7667655786350149
C =  0.01  Accuracy:  0.6574952561669829
C =  0.01  F1 score:  0.7705022250476795
C =  0.05  Accuracy:  0.6698292220113852
C =  0.05  F1 score:  0.7661290322580644
C =  0.5  Accuracy:  0.6650853889943074
C =  0.5  F1 score:  0.7494677075940384
C =  0.1  Accuracy:  0.674573055028463
C =  0.1  F1 score:  0.7658703071672355


# Linear SVC model

In [16]:
from sklearn.svm import LinearSVC

c = [0.001, 0.01, 0.05, 0.5, 0.1]
for i in c:
    model_svm = LinearSVC(C=i)
    model_svm.fit(X_train, y_train)
    y_pred = model_svm.predict(X_test)
    accuracy_svm = accuracy_score(y_test, y_pred)
    f1_svm = f1_score(y_test, y_pred)
    print ("C = ", i, " Accuracy: ", accuracy_svm)
    print ("C = ", i, " F1 score: ", f1_svm)

C =  0.001  Accuracy:  0.6584440227703985
C =  0.001  F1 score:  0.7692307692307693
C =  0.01  Accuracy:  0.6726755218216319
C =  0.01  F1 score:  0.765146358066712
C =  0.05  Accuracy:  0.6669829222011385
C =  0.05  F1 score:  0.7505330490405118
C =  0.5  Accuracy:  0.6527514231499051
C =  0.5  F1 score:  0.7332361516034985
C =  0.1  Accuracy:  0.6565464895635673
C =  0.1  F1 score:  0.7391930835734869


# SVM with 'linear' kernel

In [17]:
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
f1_svm = f1_score(y_test, y_pred)
print ("Accuracy: ", accuracy_svm)
print (" F1 score: ", f1_svm)

Accuracy:  0.6480075901328273
 F1 score:  0.7278063096111519


# SVM with 'poly' kernel

In [18]:
svm = SVC(kernel = 'poly')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
f1_svm = f1_score(y_test, y_pred)
print ("Accuracy: ", accuracy_svm)
print (" F1 score: ", f1_svm)

Accuracy:  0.6166982922201139
 F1 score:  0.7566265060240964


# SVM with 'rbf' kernel

In [19]:
svm = SVC(kernel = 'rbf')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
f1_svm = f1_score(y_test, y_pred)
print ("Accuracy: ", accuracy_svm)
print (" F1 score: ", f1_svm)

Accuracy:  0.6669829222011385
 F1 score:  0.772225827384815


# Test data

In [0]:
X_test = pd.read_csv('/content/drive/My Drive/SMAI_Final_Assignment/Q1/final_test.csv')
X_test = X_test['text']
X_test = X_test.to_numpy()
X_test1 = X_test

In [0]:
X_test = X_test1

p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER, p.OPT.MENTION)
i = 0
for line in X_test:
#     print(line)
    cleaned_tweet = p.clean(line)
    X_test[i] = cleaned_tweet
    i = i+1


i = 0
for text in X_test:
    arr = wordninja.split(text)
    arr1 = ""
    for j in arr:
        arr1 = arr1+" "+j
    X_test[i] = arr1
    i = i+1

X_test = preprocess_reviews(X_test)

# X_test = remove_stop_words(X_test)

X_test = get_lemmatized_text(X_test)
X_test = vectorizer.transform(X_test)
X_test

<1153x58165 sparse matrix of type '<class 'numpy.int64'>'
	with 16256 stored elements in Compressed Sparse Row format>

In [0]:
svm = SVC(kernel = 'rbf')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
# y_pred

# Saving predictions to csv file

In [0]:
df = pd.DataFrame()
df['labels'] = y_pred.tolist()
df.to_csv("submissionq1.csv")

In [0]:
!cp submissionq1.csv "drive/My Drive/smai"