In [1]:

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer  #lemmatization
from nltk.stem import PorterStemmer   #stemming
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re

In [2]:

class data_preprocessing:
    def read_train_data(self,train_file_name):
        data=pd.read_csv(train_file_name)
        #data=np.array(data)
        x = data.drop('labels',axis = 1)
        x = np.array(x)
        y = data.labels
        return x,y
        
    def clean_data(self,data):
        data_1=[]
        for tuple in data:
            str_reg=re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', tuple[0])
            data_1.append(str_reg)
            
        data_2=[]
        emoj = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002500-\U00002BEF"  # chinese char
                u"\U00002702-\U000027B0"
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u"\U00010000-\U0010ffff"
                u"\u2640-\u2642" 
                u"\u2600-\u2B55"
                u"\u200d"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\ufe0f"  # dingbats
                u"\u3030"
                              "]+", re.UNICODE)
        for tuple in data_1:
            str_reg=re.sub('[!@#$,.-;&?]', ' ',tuple)  #('[!@#$,.\'-;&]')  , (r'[^\w]')
            str_reg=str_reg.replace('\'','')
            str_reg=str_reg.replace('’','')
            str_reg=re.sub(emoj,' ',str_reg)
            data_2.append(str_reg)
        
        stop_words = set(stopwords.words('english'))
        data_3=[]
        ps = PorterStemmer() #stemming
        lemmatizer = WordNetLemmatizer()  #lemmatization
        for tuple in data_2:
            filtered_sentence=[]
            word_tokens=word_tokenize(tuple)
            for w in word_tokens:
                if w not in stop_words:
                    w=ps.stem(w)
                    w=lemmatizer.lemmatize(w)
                    filtered_sentence.append(w)
            data_3.append(filtered_sentence)
        
        clean_data=[' '.join(i) for i in data_3]
        return clean_data
    
    def strToVec(self,clean_data):
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(clean_data)
        return vectors



In [3]:

obj1=data_preprocessing()
data,labels=obj1.read_train_data("train_data.csv")
clean_train_data=obj1.clean_data(data)
train_data=obj1.strToVec(clean_train_data)

# Logistic regression

Lets first split data and check

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_data, labels, random_state=100)

In [6]:
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
pred=logisticRegr.predict(x_test)

In [8]:
from sklearn.metrics import accuracy_score

In [9]:
accuracy_score(y_test, pred, normalize=True)

0.6735003796507213

Lets load test data and predict labels

In [10]:
obj2=data_preprocessing()
test_data_loaded=pd.read_csv("test_data.csv")
test_data_loaded=np.array(test_data_loaded)
train_data_loaded,labels=obj2.read_train_data("train_data.csv")


In [11]:
clean_test_data=obj2.clean_data(test_data_loaded)
clean_train_data=obj2.clean_data(train_data_loaded)
print(len(clean_test_data))
print(len(clean_train_data))

586
5266


We fit only train data, and we trasform both train and test data

In [12]:
tf = TfidfVectorizer()
tf.fit(clean_train_data)
train_data = tf.transform(clean_train_data)
test_data = tf.transform(clean_test_data)

In [13]:
print(train_data.shape)
print(test_data.shape)

(5266, 12712)
(586, 12712)


Lets now train on whole train data

In [14]:
logisticRegr = LogisticRegression()
logisticRegr.fit(train_data, labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
y_pred_lg=logisticRegr.predict(test_data)

In [16]:
print(y_pred_lg)

[1 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0
 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 1 0 1
 1 0 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 0 1 0 1 0 1 1 1 1 0 1 1 0 0 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1
 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1
 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 1 1 0 1
 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 1 1 0 0 1 0 1 0 0 1 1 1 0 1
 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 0 0 0 1 1 1 0 1 0 1 1 1 1 0
 1 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1
 1 1 0 0 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 0 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 0 1 0 1 0 1 1 1 1 1 1 

# SVM

Lets split train data and find accuracy

In [17]:

from sklearn import svm

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_data, labels, random_state=100)

In [19]:
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
accuracy_score(y_test, y_pred, normalize=True)

0.6772968868640851

Lets load test data and predict labels


In [22]:

obj3=data_preprocessing()
test_data_loaded=pd.read_csv("test_data.csv")
test_data_loaded=np.array(test_data_loaded)
train_data_loaded,labels=obj3.read_train_data("train_data.csv")

In [23]:

clean_test_data=obj3.clean_data(test_data_loaded)
clean_train_data=obj3.clean_data(train_data_loaded)
print(len(clean_test_data))
print(len(clean_train_data))

586
5266


We fit only train data, and we trasform both train and test data

In [24]:

tf = TfidfVectorizer()
tf.fit(clean_train_data)
train_data = tf.transform(clean_train_data)
test_data = tf.transform(clean_test_data)

In [25]:

print(train_data.shape)
print(test_data.shape)

(5266, 12712)
(586, 12712)


Lets now train on whole train data

In [26]:

clf = svm.SVC(kernel='linear')
clf.fit(train_data, labels)
y_pred_svm = clf.predict(test_data)

In [28]:
print(y_pred_svm)

[1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 0
 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 1
 1 0 0 0 1 0 0 1 0 0 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1
 0 1 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 0 0 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1
 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 0 0 0 1 0 1 1 1 1 1 0 1
 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 0 0 1 1 1 0 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1 1 1 0 1 0 1 1 1 1 0
 1 0 0 1 1 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 1 1 1 0 1 1 0 1 0 1 0 1 1 1
 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 1
 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1
 1 0 1 0 0 0 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1
 1 0 1 0 1 1 0 1 0 1 1 1 