In [1]:
#link google drive
!pip install -U -q PyDrive 
  
from pydrive.auth import GoogleAuth 
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials   

# Authenticate and create the PyDrive client. 
auth.authenticate_user() 
gauth = GoogleAuth() 
gauth.credentials = GoogleCredentials.get_application_default() 
drive = GoogleDrive(gauth)

In [2]:
#get the csv file from google drive
link = 'https://drive.google.com/file/d/1dTIWNpjlrnTQBIQtaGOh0jCRYZiAQO79/view'

# to get the id part of the file 
id = link.split('/')[-2] 

downloaded = drive.CreateFile({'id':id})  
downloaded.GetContentFile('SentimentTweets.csv')

In [3]:
#read from csv and store data
import pandas as pd 

df = pd.read_csv('SentimentTweets.csv',
                encoding = 'ISO-8859-1',
                usecols=['target', 'text'],
                dtype='unicode')

In [4]:
df1 = df.sample(n=10000)
features = df1['text'].values
labels = df1['target'].values
#features = df['text'].values
#labels = df['target'].values

In [None]:
#We use spacy for lemmatization alongside with lemminflect extension. We also use spacy for removing stopwords
!pip3 install lemminflect
import re
import string
import nltk 
import spacy
import lemminflect
import en_core_web_sm
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('words')

sp = spacy.load('en_core_web_sm')
all_stopwords = sp.Defaults.stop_words
words = set(nltk.corpus.words.words())

processed_features = []

# load en_core_web_sm of English for vocabulary, syntax & entities
nlp = en_core_web_sm.load()

def sentence_lemmatization(my_doc, processed_feature):
    lemma_words = []
    for word in my_doc:
        lemma_words.append(word._.lemma())
    return ' '.join(lemma_words)

def remove_stopwords(processed_feature):
    text_tokens = word_tokenize(processed_feature)
    filtered_words = [word for word in text_tokens if not word in all_stopwords]
    return ' '.join(filtered_words)

def removeNonEnglish(processed_feature):
    return ' '.join(word for word in nltk.wordpunct_tokenize(processed_feature) if word.lower() in words or not word.isalpha())

for sentence in features:
    # Remove Usernames and Hashtags
    processed_feature = ' '.join(word for word in sentence.split() if not word.startswith('@'))
    processed_feature = ' '.join(word for word in processed_feature.split() if not word.startswith('#'))

    # Remove stopwords
    processed_feature = remove_stopwords(processed_feature)

    # Remove punctuation
    processed_feature = processed_feature.translate(str.maketrans('', '', string.punctuation))

    # Remove urls
    processed_feature = re.sub(r'http\S+|www\S+|https\S+', '', processed_feature)
    
    # Remove all the special characters and numbers
    processed_feature = re.sub(r'\W|\d+', ' ', processed_feature)

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Remove non-Ascii characters
    processed_feature = re.sub(r'[^\x00-\x7F]+',' ', processed_feature)

    # Remove single space from the beginning
    processed_feature = processed_feature.strip()

    # "nlp" Object is used to create documents with linguistic annotations.
    my_doc = nlp(processed_feature)

    # Use spacy to lemmatize the sentences
    processed_feature = sentence_lemmatization(my_doc, processed_feature)

    processed_feature = removeNonEnglish(processed_feature)

    # Remove -PROP- after lemmatization
    processed_feature = re.sub(r'-PROP-', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [7]:
#Use TfidfVectorizer to vectorize data
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_df=1.0, min_df=1, max_features=10000)
processed_features = vectorizer.fit_transform(processed_features).toarray()

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.3, random_state=40)

In [9]:
#Model Development
from sklearn.linear_model import LogisticRegression

# default solver is incredibly slow thats why we change it
logreg = LogisticRegression(solver = 'lbfgs')

logreg.fit(X_train, y_train)
# make predictions on the testing set
y_pred = logreg.predict(X_test)

In [10]:
#Evaluation
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test,y_pred))
print('Accuracy score:',accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        17
           4       0.81      0.81      0.81        16

    accuracy                           0.82        33
   macro avg       0.82      0.82      0.82        33
weighted avg       0.82      0.82      0.82        33

Accuracy score: 0.8181818181818182
