<h1><center>Random Forest Models</center></h1>

<hr>

# Bag-of-Words Vectoriser
## Standard Metrics

In [6]:
import requests
import io
import spacy
import pandas as pd
import re
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import string
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import nltk
from sklearn.pipeline import Pipeline

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Reading Github csv file

# Training csv
url_train = "https://raw.githubusercontent.com/SoniaLei/nlp-web-scrapping/development/data/raw/tweets-train.csv"

csv_train = requests.get(url_train).content

df_train = pd.read_csv(io.StringIO(csv_train.decode('utf-8')))

X_train = df_train['text'].astype(str)
Y_train = df_train['sentiment'].astype(str)

# Testing csv
url_test = "https://raw.githubusercontent.com/SoniaLei/nlp-web-scrapping/development/data/raw/tweets-test.csv"

csv_test = requests.get(url_test).content

df_test = pd.read_csv(io.StringIO(csv_test.decode('utf-8')))

X_test = df_test['text'].astype(str)
Y_test = df_test['sentiment'].astype(str)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating custom tokeniser and cleaning function

Lemmatiser = nltk.stem.WordNetLemmatizer()
# Instantiating the NLTK Lemmatiser

punctuations = string.punctuation
# Putting punctuation symbols into an object

nlp = spacy.load("en_core_web_sm")
# Import spacy model

stopwords = spacy.lang.en.stop_words.STOP_WORDS
# A list of stopwords that can be filtered out
    # NLTK also has a stop words object but it has fewer words

def text_cleaner(sentence):    
                
    sentence = "".join([char for char in sentence.strip() if char not in punctuations])
    # Getting rid of any punctuation characters
    
    myTokens = re.split('\W+', sentence)
    # Tokenising the words
    
    myTokens = [token.lower() for token in myTokens if token not in stopwords]
    # Removing stop words
    
    myTokens = [Lemmatiser.lemmatize(token) for token in myTokens]
    # Lemmatising the words and putting in lower case except for proper nouns
    
    return myTokens    

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating Vectoriser and Classifiers

bow_vector = CountVectorizer(tokenizer = text_cleaner, ngram_range=(1,1))

rfc = RandomForestClassifier(n_jobs=-1)

pipe = Pipeline([('vectorizer', bow_vector)
                 ,('classifier', rfc)])

pipe.fit(X_train, Y_train)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Evaluating the model

predicted = pipe.predict(X_test)

# Evaluation
print("RAW DATA - BoW, Metrics:\n")
print("Logistic Regression Accuracy:\n",metrics.accuracy_score(Y_test, predicted),"\n") # Accuracy
print("Logistic Regression Precision:\n",metrics.precision_score(Y_test, predicted, average='macro'),"\n") # Precision
print("Logistic Regression Recall:\n",metrics.recall_score(Y_test, predicted, average='macro'),"\n") # Recall
print("Logistic Regression F1 Score:\n",metrics.f1_score(Y_test, predicted, average='macro')) # F1 Score

RAW DATA - BoW, Metrics:

Logistic Regression Accuracy:
 0.6977928692699491 

Logistic Regression Precision:
 0.7069249683828303 

Logistic Regression Recall:
 0.6930694871764681 

Logistic Regression F1 Score:
 0.6981947460447407


<hr>

# Bag-of-Words Vectoriser
## K-Fold Evaluation

In [9]:
import requests
import io
import spacy
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import string
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import nltk
from sklearn.pipeline import Pipeline

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Reading Github csv file

url = "https://raw.githubusercontent.com/SoniaLei/nlp-web-scrapping/development/data/raw/tweets-train.csv"

csv = requests.get(url_train).content

df = pd.read_csv(io.StringIO(csv_train.decode('utf-8')))

X = df['text'].astype(str)
# Convert to type 'string' as pandas converts inputs to their most relevant type
    # The issue is sometimes pandas converts data to a 'float'; this doesn't work with the evaluation functions
    
Y = df_train['sentiment'].astype(str)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating custom tokeniser and cleaning function

Lemmatiser = nltk.stem.WordNetLemmatizer()
# Instantiating the NLTK Lemmatiser

punctuations = string.punctuation
# Putting punctuation symbols into an object

nlp = spacy.load("en_core_web_sm")
# Import spacy model

stopwords = spacy.lang.en.stop_words.STOP_WORDS
# A list of stopwords that can be filtered out
    # NLTK also has a stop words object but it has fewer words

def text_cleaner(sentence):    
                
    sentence = "".join([char for char in sentence.strip() if char not in punctuations])
    # Getting rid of any punctuation characters
    
    myTokens = re.split('\W+', sentence)
    # Tokenising the words
    
    myTokens = [token.lower() for token in myTokens if token not in stopwords]
    # Removing stop words
    
    myTokens = [Lemmatiser.lemmatize(token) for token in myTokens]
    # Lemmatising the words and putting in lower case except for proper nouns
    
    return myTokens    

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating Vectoriser and Classifiers

bow_vector = CountVectorizer(tokenizer = text_cleaner, ngram_range=(1,1))

rfc = RandomForestClassifier(n_jobs=-1)

pipe = Pipeline([('vectorizer', bow_vector)
                 ,('classifier', rfc)])

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Evaluating the model

k_fold = KFold(n_splits=5)
# 5 splits

print("RAW DATA - BoW, K-Fold:\n")
print(cross_val_score(estimator=pipe, X=X, y=Y, cv=k_fold, scoring='accuracy', n_jobs=-1))

RAW DATA - BoW, K-Fold:

[0.69055849 0.69195779 0.6977802  0.69796215 0.70178311]


<hr>

# TF-IDF Vectoriser
## Standard Metrics

In [7]:
import requests
import io
import spacy
import pandas as pd
import re
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import string
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import nltk
from sklearn.pipeline import Pipeline

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Reading Github csv file

# Training csv
url_train = "https://raw.githubusercontent.com/SoniaLei/nlp-web-scrapping/development/data/raw/tweets-train.csv"

csv_train = requests.get(url_train).content

df_train = pd.read_csv(io.StringIO(csv_train.decode('utf-8')))

X_train = df_train['text'].astype(str)
Y_train = df_train['sentiment'].astype(str)

# Testing csv
url_test = "https://raw.githubusercontent.com/SoniaLei/nlp-web-scrapping/development/data/raw/tweets-test.csv"

csv_test = requests.get(url_test).content

df_test = pd.read_csv(io.StringIO(csv_test.decode('utf-8')))

X_test = df_test['text'].astype(str)
Y_test = df_test['sentiment'].astype(str)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating custom tokeniser and cleaning function

Lemmatiser = nltk.stem.WordNetLemmatizer()
# Instantiating the NLTK Lemmatiser

punctuations = string.punctuation
# Putting punctuation symbols into an object

nlp = spacy.load("en_core_web_sm")
# Import spacy model

stopwords = spacy.lang.en.stop_words.STOP_WORDS
# A list of stopwords that can be filtered out
    # NLTK also has a stop words object but it has fewer words

def text_cleaner(sentence):    
                
    sentence = "".join([char for char in sentence.strip() if char not in punctuations])
    # Getting rid of any punctuation characters
    
    myTokens = re.split('\W+', sentence)
    # Tokenising the words
    
    myTokens = [token.lower() for token in myTokens if token not in stopwords]
    # Removing stop words
    
    myTokens = [Lemmatiser.lemmatize(token) for token in myTokens]
    # Lemmatising the words and putting in lower case except for proper nouns
    
    return myTokens    

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating Vectoriser and Classifiers

tfidf_vector = TfidfVectorizer(tokenizer = text_cleaner)

rfc = RandomForestClassifier(n_jobs=-1)

pipe = Pipeline([('vectorizer', tfidf_vector)
                 ,('classifier', rfc)])

pipe.fit(X_train, Y_train)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Evaluating the model

predicted = pipe.predict(X_test)

# Evaluation
print("RAW DATA - TF-IDF, Metrics:\n")
print("Logistic Regression Accuracy:\n",metrics.accuracy_score(Y_test, predicted),"\n") # Accuracy
print("Logistic Regression Precision:\n",metrics.precision_score(Y_test, predicted, average='macro'),"\n") # Precision
print("Logistic Regression Recall:\n",metrics.recall_score(Y_test, predicted, average='macro'),"\n") # Recall
print("Logistic Regression F1 Score:\n",metrics.f1_score(Y_test, predicted, average='macro')) # F1 Score

RAW DATA - TF-IDF, Metrics:

Logistic Regression Accuracy:
 0.7113752122241087 

Logistic Regression Precision:
 0.7230802481043201 

Logistic Regression Recall:
 0.7064403411638226 

Logistic Regression F1 Score:
 0.712485901766117


<hr>

# TF-IDF Vectoriser
## K-Fold Evaluation

In [8]:
import requests
import io
import spacy
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import string
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import nltk
from sklearn.pipeline import Pipeline

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Reading Github csv file

url = "https://raw.githubusercontent.com/SoniaLei/nlp-web-scrapping/development/data/raw/tweets-train.csv"

csv = requests.get(url_train).content

df = pd.read_csv(io.StringIO(csv_train.decode('utf-8')))

X = df['text'].astype(str)
# Convert to type 'string' as pandas converts inputs to their most relevant type
    # The issue is sometimes pandas converts data to a 'float'; this doesn't work with the evaluation functions
    
Y = df_train['sentiment'].astype(str)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating custom tokeniser and cleaning function

Lemmatiser = nltk.stem.WordNetLemmatizer()
# Instantiating the NLTK Lemmatiser

punctuations = string.punctuation
# Putting punctuation symbols into an object

nlp = spacy.load("en_core_web_sm")
# Import spacy model

stopwords = spacy.lang.en.stop_words.STOP_WORDS
# A list of stopwords that can be filtered out
    # NLTK also has a stop words object but it has fewer words

def text_cleaner(sentence):    
                
    sentence = "".join([char for char in sentence.strip() if char not in punctuations])
    # Getting rid of any punctuation characters
    
    myTokens = re.split('\W+', sentence)
    # Tokenising the words
    
    myTokens = [token.lower() for token in myTokens if token not in stopwords]
    # Removing stop words
    
    myTokens = [Lemmatiser.lemmatize(token) for token in myTokens]
    # Lemmatising the words and putting in lower case except for proper nouns
    
    return myTokens    

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Creating Vectoriser

tfidf_vector = TfidfVectorizer(tokenizer = text_cleaner)

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

                        # Evaluating the model
    
rfc = RandomForestClassifier(n_jobs=-1)

pipe = Pipeline([('vectorizer', tfidf_vector)
                 ,('classifier', rfc)])

k_fold = KFold(n_splits=5)
# 5 splits

print("RAW DATA - TF-IDF, K-Fold:\n")
print(cross_val_score(estimator=pipe, X=X, y=Y, cv=k_fold, scoring='accuracy', n_jobs=-1))

RAW DATA - TF-IDF, K-Fold:

[0.69401492 0.69796215 0.70196507 0.6977802  0.70487627]
