Optimising pre-processing and feature extraction

In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np
import nltk
import pandas as pd
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data.

    Each usable row is parsed with parse_data_line and stored in the
    module-level ``raw_data`` list as a ``(text, label)`` tuple.

    Args:
        path: Location of the tab-separated dataset file.
    """
    # Read explicitly as UTF-8 so decoding does not depend on the platform
    # default; newline='' is what the csv module expects for file objects.
    with open(path, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            # csv.reader yields [] for blank lines, so guard before indexing.
            if not line or line[0] == "Id":  # skip blank rows and the header
                continue
            (label, text) = parse_data_line(line)
            if label is None or text is None:
                # Row did not contain both fields; storing (None, None)
                # would only fail later during pre-processing.
                continue
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Pre-process raw_data and distribute it over train_data and test_data.

    The first ``percentage`` share of ``raw_data`` (in its existing order) is
    converted to ``(feature_vector, label)`` pairs and appended to the
    module-level ``train_data``; the remainder is appended to ``test_data``.
    """
    boundary = int((percentage * len(raw_data)))

    # Training portion is featurized first, then the held-out portion.
    train_data.extend(
        (to_feature_vector(pre_process(text)), label)
        for (text, label) in raw_data[:boundary]
    )
    test_data.extend(
        (to_feature_vector(pre_process(text)), label)
        for (text, label) in raw_data[boundary:]
    )

In [3]:
def parse_data_line(line):
    """Extract the label and the statement text from one parsed dataset row.

    Args:
        line: Sequence of column values for a single row. The label is read
            from index 1 and the statement from index 2 (index 0 appears to
            be the row id, judging by the "Id" header check in load_data).

    Returns:
        A ``(label, statement)`` tuple, or ``(None, None)`` when the row has
        fewer than three columns.
    """
    # Index 2 is read below, so at least three columns are required; the
    # previous ``>= 2`` guard raised IndexError on two-column rows.
    if len(line) >= 3:
        return line[1], line[2]
    return None, None


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: WordNet for the
# lemmatizer, the English stop-word list, the perceptron tagger used by
# nltk.pos_tag, and the Punkt models used by nltk.word_tokenize.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# NOTE(review): pre_process below filters against `stop_words` (defined in the
# first cell), so `stop_words_function` is not read anywhere in the visible
# notebook — confirm which name is intended.
stop_words_function = set(stopwords.words('english'))
# Re-creates the lemmatizer already instantiated in the first cell.
lemmatizer = WordNetLemmatizer()

def pre_process(text):
    """Tokenize raw text and return lower-cased, lemmatized, stop-word-free tokens.

    Punctuation glued to a word is first split off with a space, the result is
    tokenized with NLTK, and every token whose lower-cased form is not in
    ``stop_words`` is lemmatized.
    """
    # Insert a space between a word character and trailing punctuation ...
    spaced = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    # ... and between leading punctuation and the word character after it.
    spaced = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", spaced)

    cleaned = []
    for raw_token in nltk.word_tokenize(spaced):
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(lowered))
    return cleaned
def extract_pos_tags(tokens):
    """Return the part-of-speech tag of every token, in order, via nltk.pos_tag."""
    tags = []
    for _, tag in nltk.pos_tag(tokens):
        tags.append(tag)
    return tags

# import spacy

# nlp = spacy.load("en_core_web_sm")

# def extract_dependency_relations(text):
#     doc = nlp(text)
#     return [(token.text, token.dep_) for token in doc]

# 
# Quick sanity check: run pre_process on a sample tweet containing a mention,
# hashtags and emoji, and print the resulting token list.
print(pre_process("We love you,@HillaryClinton! We are always #StillWithHer! 💙 You deserved to win!#recount #faithlesselectors #AuditTheVote You are a hero! 💙"))



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['love', ',', '@', 'hillaryclinton', '!', 'always', '#', 'stillwithher', '!', '💙', 'deserved', 'win', '!', '#', 'recount', '#', 'faithlesselectors', '#', 'auditthevote', 'hero', '!', '💙']


In [5]:
# Maps every feature seen so far (unigram string or bigram tuple) to a stable
# integer index; shared by all calls to to_feature_vector.
global_feature_dict = {}


def to_feature_vector(tokens):
    """Build a sparse count vector of unigram and bigram features.

    Every token, and every pair of adjacent tokens, is looked up in
    ``global_feature_dict``; unseen features are assigned the next free index.

    Args:
        tokens: Sequence of token strings.

    Returns:
        Dict mapping feature index -> number of occurrences in ``tokens``.
    """
    counts = {}

    # Unigrams are registered before bigrams so index assignment order is
    # unigrams first, then adjacent pairs.
    unigrams = list(tokens)
    bigrams = list(zip(tokens, tokens[1:]))

    for feature in unigrams + bigrams:
        # setdefault stores the current dict size as the index only when the
        # feature has not been seen before.
        index = global_feature_dict.setdefault(feature, len(global_feature_dict))
        counts[index] = counts.get(index, 0) + 1

    return counts

# Example usage
# NOTE(review): to_feature_vector writes to global_feature_dict, so running
# this demo registers the sample unigrams and bigrams there; they are then
# included in the feature count printed later in the notebook.
tokens = ["this", "is", "a", "sample", "text"]
feature_vector = to_feature_vector(tokens)
print(feature_vector)


{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}


In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER

def train_classifier(data):
    """Fit a LinearSVC (wrapped in an NLTK SklearnClassifier) on ``data``.

    ``data`` is passed unchanged to ``SklearnClassifier.train``; the trained
    classifier object is returned.
    """
    print("Training Classifier...")
    svc_pipeline = Pipeline([('svc', LinearSVC())])
    wrapped = SklearnClassifier(svc_pipeline)
    return wrapped.train(data)

In [7]:
def cv_results(results):
    """Average the weighted precision, recall and F1 over per-fold reports.

    Args:
        results: List of report dicts, each holding a ``'weighted avg'`` entry
            with ``'precision'``, ``'recall'`` and ``'f1-score'`` values.

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1_score)``; the three values
        are also printed.
    """
    fold_count = len(results)
    metric_names = ('precision', 'recall', 'f1-score')
    totals = {name: 0 for name in metric_names}

    # Accumulate each weighted-average metric fold by fold.
    for fold_report in results:
        for name in metric_names:
            totals[name] += fold_report['weighted avg'][name]

    avg_precision = totals['precision'] / fold_count
    avg_recall = totals['recall'] / fold_count
    avg_f1_score = totals['f1-score'] / fold_count

    for caption, value in (
        ('Average Precision:', avg_precision),
        ('Average Recall:', avg_recall),
        ('Average F1 Score:', avg_f1_score),
    ):
        print(caption, value)

    return avg_precision, avg_recall, avg_f1_score
    



In [8]:

from sklearn.metrics import classification_report

def cross_validate(dataset, folds):
    """Run k-fold cross-validation and report the averaged weighted metrics.

    The dataset is cut into exactly ``folds`` contiguous, near-equal slices.
    Each slice is held out once while a classifier is trained on the rest;
    the per-fold classification reports are averaged by ``cv_results``.

    Args:
        dataset: List of ``(feature_vector, label)`` samples.
        folds: Number of folds; must be between 2 and ``len(dataset)``.

    Returns:
        The ``(avg_precision, avg_recall, avg_f1_score)`` tuple returned by
        ``cv_results``.

    Raises:
        ValueError: If ``folds`` is smaller than 2 or larger than the dataset.
    """
    folds = int(folds)
    num_samples = len(dataset)
    if folds < 2 or folds > num_samples:
        raise ValueError(
            "folds must be between 2 and the number of samples (%d), got %d"
            % (num_samples, folds)
        )

    results = []
    for fold in range(folds):
        # Integer boundaries spread the remainder over the folds, so there are
        # always exactly `folds` non-empty slices. The old fixed step of
        # int(n / folds) + 1 could yield fewer folds and a tiny last fold.
        start = fold * num_samples // folds
        end = (fold + 1) * num_samples // folds

        held_out = dataset[start:end]
        training = dataset[:start] + dataset[end:]

        classifier = train_classifier(training)
        predicted = predict_labels([sample[0] for sample in held_out], classifier)
        actual = [sample[1] for sample in held_out]

        results.append(classification_report(actual, predicted, output_dict=True))
        print("Fold start on items %d - %d" % (start, end))

    return cv_results(results)



In [9]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Return the classifier's predicted label for each pre-processed sample.

    ``samples`` is handed to ``classifier.classify_many`` unchanged.
    """
    predictions = classifier.classify_many(samples)
    return predictions

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model.

    Args:
        sample: Raw, un-tokenized text.
        classifier: Trained classifier exposing ``classify``.

    Returns:
        The label ``classifier.classify`` predicts for the sample.
    """
    processed_sample = to_feature_vector(pre_process(sample))
    # Classify the vector built above; the previous code referenced the
    # undefined names `preProcess` and `reviewSample` and raised NameError.
    return classifier.classify(processed_sample)

In [10]:

# loading reviews
def load_data(path, raw_data):
    """Read a tab-separated dataset and append ``(text, label)`` pairs to raw_data.

    The first row is treated as a header and skipped. Rows from which
    parse_data_line cannot extract both a label and a text are ignored.

    Args:
        path: Location of the UTF-8 encoded, tab-separated dataset file.
        raw_data: List that receives the ``(text, label)`` tuples in place.
    """
    # newline='' is what the csv module expects for file objects.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # Skip header
        for line in reader:
            label, text = parse_data_line(line)
            if label is None or text is None:
                # Blank or truncated row: storing (None, None) would only
                # fail later when the text is pre-processed.
                continue
            raw_data.append((text, label))
# initialize global lists that will be appended to by the methods below
def split_and_preprocess_data(raw_data, percentage):
    """Split raw_data by position and convert both parts to feature vectors.

    Args:
        raw_data: List of ``(text, label)`` tuples.
        percentage: Fraction of samples (taken from the front of the list)
            that goes into the training set.

    Returns:
        ``(train_data, test_data)``, each a list of
        ``(feature_vector, label)`` tuples.
    """
    def featurize(samples):
        # Each sample becomes a (featureset, label) tuple.
        return [(to_feature_vector(pre_process(text)), label) for text, label in samples]

    cutoff = int(percentage * len(raw_data))

    # The training slice is featurized before the test slice.
    train_data = featurize(raw_data[:cutoff])
    test_data = featurize(raw_data[cutoff:])
    return train_data, test_data


# Module-level containers; re-initialised here so re-running the cell starts
# from empty lists.
raw_data = []          # (text, label) tuples read from the dataset file
train_data = []        # pre-processed training samples (first part of raw_data)
test_data = []         # pre-processed test samples (remaining part of raw_data)


# Path to the tab-separated dataset file, relative to the working directory
data_file_path = 'sentiment-dataset.tsv'

# Run the pipeline using the functions defined above.
# Report the (still empty) list sizes before anything is loaded.
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

# Parse the dataset file; load_data appends to raw_data in place.
load_data(data_file_path,raw_data) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# Cross-validation is later run on the 80% training portion only.
# Report the list sizes after loading but before the split.
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

# The split is positional (no shuffling): the first 80% of rows become the
# training set. The lists initialised above are rebound to the returned ones.
train_data, test_data=split_and_preprocess_data(raw_data,0.8)

# Report the list sizes after the split, plus the number of training samples
# and the number of distinct features registered in global_feature_dict.
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...


Now 33539 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 33539 rawData, 26831 trainData, 6708 testData
Training Samples: 
26831
Features: 
350389


In [11]:
# Run 10-fold cross-validation on the training split only; cross_validate
# returns the averaged (precision, recall, f1-score) tuple from cv_results.
pred=cross_validate(train_data, 10)  # will work and output overall performance of p, r, f-score when cv implemented

print(pred)
   

Training Classifier...


In [None]:
# Finally, check the performance of your classifier by training on all the training data
# and testing on the held-out test set.
# Will only work once all functions are complete
functions_complete = True  # set to False to skip the final evaluation
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    # average='weighted' returns (precision, recall, fscore, support)
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    print("Done training!")
    # Only the first three entries are formatted; support is dropped.
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

({148: 1, 44: 2, 2421: 1, 142016: 1, 140: 1, 118: 1, 292323: 1, 292324: 1, 9: 1, 292325: 1, 146: 2, 287: 1, 85401: 1, 7661: 1, 5593: 1, 51855: 1, 292326: 1, 292327: 1, 50902: 1, 292328: 1, 292329: 1, 292330: 1, 292331: 1, 292332: 1, 793: 1, 17237: 1, 292333: 1, 292334: 1, 205215: 1}, 'positive')
Training Classifier...




Done training!
Precision: 0.859621
Recall: 0.861061
F Score:0.859627
