### Imports

In [1]:
import re
import pandas as pd
import numpy as np
import gdown
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

### Download and Read Dataset

In [2]:
url = "https://drive.google.com/u/0/uc?id=1dTIWNpjlrnTQBIQtaGOh0jCRYZiAQO79&export=download"
filename = "SentimentTweets.csv"
gdown.download(url, filename, quiet=False)

Downloading...
From: https://drive.google.com/u/0/uc?id=1dTIWNpjlrnTQBIQtaGOh0jCRYZiAQO79&export=download
To: /home/vasilis/projects/ai2-1st-assignement/SentimentTweets.csv
186MB [00:36, 5.04MB/s]


'SentimentTweets.csv'

In [3]:
data = pd.read_csv("SentimentTweets.csv")

data.dropna(inplace = True)

### Convert every char to lowercase

In [4]:
data['text'] = data['text'].apply(lambda x: str(x).lower())

### Remove Special Chars

In [5]:
X = data['text']
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.28, random_state = 4, stratify = y)

### Use tfidf to convert text to vectors
- Use strip_accents to remove accents from characters
- Use stop_words to remove stop words
- By default (lowercase=True) convert every char to lowercase

In [6]:
tfidf = TfidfVectorizer(strip_accents='ascii', stop_words={'english'})

In [7]:
X_train = tfidf.fit_transform(X_train)

In [8]:
clf = LogisticRegression()

### Cross Validation

In [9]:
%%time
scores = cross_val_score(clf, X_train, y_train, cv = 10, n_jobs = 4)

CPU times: user 410 ms, sys: 229 ms, total: 639 ms
Wall time: 2min 54s


In [11]:
scores.mean()

0.8000486098371768

In [12]:
clf.fit(X_train, y_train)

LogisticRegression()

In [13]:
y_pred = clf.predict(tfidf.transform(X_test))

### Precision, recall and f1-score results

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80    179030
           4       0.80      0.81      0.80    179371

    accuracy                           0.80    358401
   macro avg       0.80      0.80      0.80    358401
weighted avg       0.80      0.80      0.80    358401

