In [64]:
                        # Importing needed packages
    
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Reading .csv file
    
################################################################################################################
                ##### Filepaths will need to be changed #####
################################################################################################################

df_train = pd.read_csv("Files/raw/tweets-train.csv")
    
df_train = pd.read_csv("Files/raw/tweets-test.csv")

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating custom cleaner function

nlp = spacy.load("en_core_web_sm")

punctuations = string.punctuation
# Create list of punctuation marks

def spacy_cleaner(sentence):
    
    #print("Input sentence:\n", sentence,"\n")
    
    doc = nlp(sentence.strip())
    # Pass text into model's pipeline.
    
    myTokens = [token for token in doc]
    # Creating a list of the words in the sentence.
    #print("Sentence tokenised:\n", myTokens,"\n")
    
    myTokens = [token for token in myTokens if token.is_stop == False and token.text not in punctuations]
    # List of words without stopwords or punctuations.
    #print("Sentence without stopwords or punctuations:\n", myTokens, "\n")
    
    myTokens = [token.lemma_.strip().lower() if token.pos_ != "PROPN" else token.lemma_.strip() \
                for token in myTokens]
    # Words are lemmatised, spaces at end removed and (if not a proper noun) lowercased.
    
    myTokens = [token for token in myTokens if token != ""]
    
    #print("Sentence lemmatisted, no spaces and lowercase (except Proper Noun):\n", myTokens, "\n")
    
    return myTokens

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating Bag-of-Words Vectoriser

bow_vector = CountVectorizer(tokenizer = spacy_cleaner, ngram_range=(1,1))

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Splitting into Training and Testing sets

X_train = df_train['text']
Y_train = df_train['sentiment']

X_test = df_test['text'] # 'text' is what we want to analyse
Y_test = df_test['sentiment'] # 'sentiment' is the label/answer to test against

# Below needed if splitting one .csv file into training and testing sets
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Building OVR Logisitic Regression Classifier
    
ovr = OneVsRestClassifier(LogisticRegression())

pipe = Pipeline([('vectorizer', bow_vector)
                 ,('classifier', ovr)])

pipe.fit(X_train, Y_train)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Evaluating the model

predicted = pipe.predict(X_test)

# Evaluation
print("Confirmation that it works:\n")
print("Logistic Regression Accuracy:\n",metrics.accuracy_score(Y_test, predicted)) # Accuracy
print("Logistic Regression Precision:\n",metrics.precision_score(Y_test, predicted, average='macro')) # Precision
print("Logistic Regression Recall:\n",metrics.recall_score(Y_test, predicted, average='macro')) # Recall

Confirmation that it works:

Logistic Regression Accuracy:
 0.9328478964401294
Logistic Regression Precision:
 0.9349269088399522
Logistic Regression Recall:
 0.9315204145922483
