# NLP Assignment 1 (40% of grade): Sentiment Analysis from Tweets

This coursework will involve you implementing functions for a text classifier, which you will train to identify the **sentiment expressed in a text** in a dataset of approx. 27,000 entries, which will be divided into an 80%/20% training/test split.

In this template you are given the basis for that implementation, though some of the functions are missing, which you have to fill in.

Follow the instructions file **NLP_Assignment_1_Instructions.pdf** for details of each question - the outline of what needs to be achieved for each question is as below.

You must submit all **ipython notebooks and extra resources you need to run the code if you've added them** in the code submission, and a **2 page report (pdf)** in the report submission on QMPlus where you report your methods and findings according to the instructions file for each question.

# Question 5: Optimising pre-processing and feature extraction (30 marks)

In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np
import re

import csv
import numpy as np
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


In [12]:
import nltk
# Fetch the NLTK stop-word corpus that pre_process() reads through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Fetch the WordNet corpus; pre_process() lemmatizes tokens with WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
def load_data(path):
    """Load labelled texts from a tab-separated file into the global raw_data list.

    Every data row is handed to parse_data_line() and the result is appended
    to the module-level raw_data list as a (text, label) tuple. Blank rows and
    the header row (first cell equal to "Id") are skipped.

    Args:
        path: Path of the tab-separated dataset file.
    """
    # An explicit encoding keeps decoding independent of the platform's locale
    # default, and newline='' is what the csv module asks for so that newlines
    # embedded inside quoted fields are parsed correctly.
    with open(path, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError, so skip it together with the header.
            if not line or line[0] == "Id":
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Fill train_data and test_data from raw_data.

    The first int(percentage * len(raw_data)) entries of raw_data form the
    training portion and the remaining entries the test portion. Each text is
    passed through pre_process() and to_feature_vector(), and the resulting
    feature vector is appended with its label as (feature_vector, label).

    Args:
        percentage: Fraction of raw_data (e.g. 0.8) assigned to train_data.
    """
    split_at = int(percentage * len(raw_data))

    # Training portion is processed first, then the test portion, so the order
    # in which to_feature_vector() sees the texts follows raw_data.
    train_data.extend(
        (to_feature_vector(pre_process(text)), label)
        for text, label in raw_data[:split_at]
    )
    test_data.extend(
        (to_feature_vector(pre_process(text)), label)
        for text, label in raw_data[split_at:]
    )

In [3]:
def parse_data_line(data_line):
    """Extract the label and the text from one parsed dataset row.

    Args:
        data_line: A sequence of column values; index 1 holds the label and
            index 2 holds the statement text.

    Returns:
        A (label, text) tuple.
    """
    label = data_line[1]
    text = data_line[2]
    return (label, text)

In [4]:
# Input: a string of one statement
from functools import lru_cache

# Compiled once: matches every character that is neither a word character
# nor whitespace (i.e. punctuation and symbols).
_NON_WORD_RE = re.compile(r'[^\w\s]')


@lru_cache(maxsize=None)
def _pre_process_resources():
    """Build the lemmatizer and the English stop-word set once and reuse them."""
    return WordNetLemmatizer(), frozenset(stopwords.words('english'))


def pre_process(text):
    """Turn one statement into a list of normalised tokens.

    The text is lower-cased, every character that is not a word character or
    whitespace is removed, the result is split on whitespace, English stop
    words are dropped and each remaining word is lemmatized.

    Args:
        text: The raw statement as a string.

    Returns:
        A list of lemmatized, lower-case tokens in their original order.
    """
    # The lemmatizer and stop-word set do not depend on the input, so they are
    # created on the first call only instead of once per statement.
    lemmatizer, stop_words = _pre_process_resources()

    # Convert to lowercase and remove punctuation
    text = _NON_WORD_RE.sub('', text.lower())

    # NOTE(review): punctuation is stripped before the stop-word test, so a
    # contraction such as "don't" is compared as "dont" - confirm that this is
    # the intended interaction with the stop-word list.
    return [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

In [5]:
global_feature_dict = {}  # token -> number of times it has been passed to to_feature_vector()

def to_feature_vector(tokens):
    """Build a binary bag-of-words feature dictionary for one token list.

    Every distinct token becomes a key with weight 1; tokens that do not occur
    are simply absent from the dictionary. As a side effect, the module-level
    global_feature_dict count of each token is increased by one per occurrence.

    Args:
        tokens: An iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to 1.
    """
    features = {}
    for tok in tokens:
        # Presence feature: repeated tokens keep the same weight of 1.
        features.setdefault(tok, 1)
        # Running occurrence count across every call.
        if tok in global_feature_dict:
            global_feature_dict[tok] += 1
        else:
            global_feature_dict[tok] = 1
    return features

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER

from sklearn.ensemble import RandomForestClassifier

def train_classifier(data, random_state=None):
    """Train a random-forest classifier wrapped in an NLTK SklearnClassifier.

    Args:
        data: Training examples in the form accepted by
            SklearnClassifier.train(); in this file, a list of
            (feature_dict, label) tuples.
        random_state: Optional seed forwarded to RandomForestClassifier. The
            default of None keeps the previous unseeded behaviour; pass an int
            to make training reproducible between runs.

    Returns:
        The value returned by SklearnClassifier(pipeline).train(data), which
        the rest of this file uses as the trained classifier.
    """
    print("Training Classifier with Random Forest...")
    # Single-step pipeline with the random forest as the estimator.
    pipeline = Pipeline([('rfc', RandomForestClassifier(random_state=random_state))])
    return SklearnClassifier(pipeline).train(data)

In [7]:
#solution
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import KFold
import numpy as np

def cross_validate(dataset, folds):
    """Run shuffled k-fold cross-validation and average the per-fold scores.

    For every fold a fresh classifier is trained with train_classifier() on
    the fold's training portion and evaluated on its held-out portion with
    classification_report().

    Args:
        dataset: An indexable sequence of (feature_vector, label) pairs.
        folds: Number of folds passed to KFold as n_splits.

    Returns:
        A dict with the keys 'precision', 'recall', 'f1-score' (means of the
        per-fold macro averages) and 'accuracy' (mean per-fold accuracy).
    """
    # Fixed seed so the shuffled fold assignment is the same on every run.
    splitter = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_reports = []

    for fit_idx, eval_idx in splitter.split(dataset):
        fit_split = [dataset[i] for i in fit_idx]
        held_out = [dataset[i] for i in eval_idx]

        model = train_classifier(fit_split)
        guessed = predict_labels([pair[0] for pair in held_out], model)
        gold = [pair[1] for pair in held_out]

        fold_reports.append(classification_report(gold, guessed, output_dict=True))

    def mean_over_folds(pick):
        """Average the value selected by pick() from every fold's report."""
        return np.mean([pick(report) for report in fold_reports])

    return {
        'precision': mean_over_folds(lambda r: r['macro avg']['precision']),
        'recall': mean_over_folds(lambda r: r['macro avg']['recall']),
        'f1-score': mean_over_folds(lambda r: r['macro avg']['f1-score']),
        'accuracy': mean_over_folds(lambda r: r['accuracy']),
    }

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Return the classifier's predictions for already pre-processed samples.

    Args:
        samples: The feature representations to classify.
        classifier: An object exposing classify_many(samples).

    Returns:
        Whatever classifier.classify_many(samples) returns.
    """
    predictions = classifier.classify_many(samples)
    return predictions

def predict_label_from_raw(sample, classifier):
    """Classify one raw text string.

    The text goes through pre_process() and to_feature_vector() before it is
    handed to classifier.classify().

    Args:
        sample: The raw text to classify.
        classifier: An object exposing classify(feature_vector).

    Returns:
        The label returned by classifier.classify().
    """
    tokens = pre_process(sample)
    features = to_feature_vector(tokens)
    return classifier.classify(features)

In [15]:
# MAIN

# Module-level lists; load_data() and split_and_preprocess_data() append to them.
raw_data = []      # (text, label) pairs read from the dataset file
train_data = []    # (feature_vector, label) pairs for the training portion
test_data = []     # (feature_vector, label) pairs for the test portion

# Location of the tab-separated dataset file
data_file_path = 'sentiment-dataset.tsv'

# Show the (still empty) containers, then parse the dataset into raw_data.
print(
    "Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
    "Preparing the dataset...",
    sep='\n',
)

load_data(data_file_path)

# Show the sizes after loading, then split 80/20 and pre-process both parts.
# Cross-validation is later run on the 80% training portion.
print(
    "Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
    "Preparing training and test data...",
    sep='\n',
)

split_and_preprocess_data(0.8)

# Show the sizes after the split, plus the number of training samples and the
# number of distinct features collected in global_feature_dict.
print(
    "After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
    "Training Samples: ",
    len(train_data),
    "Features: ",
    len(global_feature_dict),
    sep='\n',
)


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 33540 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 33540 rawData, 26832 trainData, 6708 testData
Training Samples: 
26832
Features: 
58934


In [16]:
cross_validate(train_data, 10)  # 10-fold cross-validation on the training split; the averaged scores are shown as the cell output

Training Classifier with Random Forest...
Training Classifier with Random Forest...
Training Classifier with Random Forest...
Training Classifier with Random Forest...
Training Classifier with Random Forest...
Training Classifier with Random Forest...
Training Classifier with Random Forest...
Training Classifier with Random Forest...
Training Classifier with Random Forest...
Training Classifier with Random Forest...


{'precision': 0.838961586787528,
 'recall': 0.7821519872856413,
 'f1-score': 0.7993711744713882,
 'accuracy': 0.8319170546127769}