In [1]:
# Loading requried libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use(style='seaborn')
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Loading Dataset
df=pd.read_csv(r'data/all-data.csv',encoding = "ISO-8859-1")
df.head()

  plt.style.use(style='seaborn')


Unnamed: 0,Sentiment,News Headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [2]:
# Checking the distribution
df['Sentiment'].value_counts()

Sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64

In [3]:
df['Sentiment'] = df['Sentiment'].replace({'negative': 1, 'neutral': 2, 'positive': 3})

In [4]:
# Train Test split
train = df.sample(frac=0.7, random_state=42)
test = df.drop(train.index)

print(train.shape)
print(test.shape)

(3392, 2)
(1454, 2)


For infomation and explanation about n grams refer the nlp.ipynb notebook

In [5]:
def process_text(news,n):
    """Process news function.
    Input:
        news: a string containing a news
    Output:
        news_clean: a list of words containing the processed news

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    news = re.sub(r'\$\w*', '', news)
    # remove old style renews text "RT"
    news = re.sub(r'^RT[\s]+', '', news)
    # remove hyperlinks    
    news = re.sub(r'https?://[^\s\n\r]+', '', news)
    # remove hashtags
    # only removing the hash # sign from the word
    news = re.sub(r'#', '', news)
    
    tokens  = word_tokenize(news)

    cleaned_tokens  = [token.lower() for token in tokens if token.lower() not in stopwords.words('english') and token not in string.punctuation]

    return list(ngrams(cleaned_tokens, n))

In [6]:
process_text("Create unigrams for each of the news records belonging to each of the three categories of sentiments",2)

[('create', 'unigrams'),
 ('unigrams', 'news'),
 ('news', 'records'),
 ('records', 'belonging'),
 ('belonging', 'three'),
 ('three', 'categories'),
 ('categories', 'sentiments')]

In [7]:
train['News_1gram'] = train['News Headline'].apply(lambda x: process_text(x, 1))
train['News_2gram'] = train['News Headline'].apply(lambda x: process_text(x, 2))
train['News_3gram'] = train['News Headline'].apply(lambda x: process_text(x, 3))

In [8]:
train.head()

Unnamed: 0,Sentiment,News Headline,News_1gram,News_2gram,News_3gram
3207,2,The company was supposed to deliver machinery ...,"[(company,), (supposed,), (deliver,), (machine...","[(company, supposed), (supposed, deliver), (de...","[(company, supposed, deliver), (supposed, deli..."
1684,2,UNC Charlotte would also deploy SSH Tectia Con...,"[(unc,), (charlotte,), (would,), (also,), (dep...","[(unc, charlotte), (charlotte, would), (would,...","[(unc, charlotte, would), (charlotte, would, a..."
1044,2,"In 2009 , Lee & Man had a combined annual prod...","[(2009,), (lee,), (man,), (combined,), (annual...","[(2009, lee), (lee, man), (man, combined), (co...","[(2009, lee, man), (lee, man, combined), (man,..."
4145,2,`` That 's a very high figure on the European ...,"[(``,), ('s,), (high,), (figure,), (european,)...","[(``, 's), ('s, high), (high, figure), (figure...","[(``, 's, high), ('s, high, figure), (high, fi..."
1538,2,"In Finland , the corresponding service is Alma...","[(finland,), (corresponding,), (service,), (al...","[(finland, corresponding), (corresponding, ser...","[(finland, corresponding, service), (correspon..."


In [9]:
def build_freqs(news, ys):
    """Build frequencies.
    Input:
        news: a list of news
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all news
    # and over all processed words in each tweet.
    freqs = {}
    for y, new in zip(yslist, news):
        for word in new:
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

In [10]:
# For this project we will use only 3grams
freqs = build_freqs(train['News_3gram'], train['Sentiment'])

In [11]:
freqs

{(('company', 'supposed', 'deliver'), 2): 1,
 (('supposed', 'deliver', 'machinery'), 2): 1,
 (('deliver', 'machinery', 'veneer'), 2): 1,
 (('machinery', 'veneer', 'mill'), 2): 1,
 (('veneer', 'mill', 'tomsk'), 2): 1,
 (('mill', 'tomsk', 'region'), 2): 1,
 (('tomsk', 'region', 'russia'), 2): 1,
 (('unc', 'charlotte', 'would'), 2): 1,
 (('charlotte', 'would', 'also'), 2): 1,
 (('would', 'also', 'deploy'), 2): 1,
 (('also', 'deploy', 'ssh'), 2): 1,
 (('deploy', 'ssh', 'tectia'), 2): 1,
 (('ssh', 'tectia', 'connector'), 2): 1,
 (('tectia', 'connector', 'enable'), 2): 1,
 (('connector', 'enable', 'secure'), 2): 1,
 (('enable', 'secure', 'application'), 2): 1,
 (('secure', 'application', 'connectivity'), 2): 1,
 (('2009', 'lee', 'man'), 2): 1,
 (('lee', 'man', 'combined'), 2): 1,
 (('man', 'combined', 'annual'), 2): 1,
 (('combined', 'annual', 'production'), 2): 1,
 (('annual', 'production', 'capacity'), 2): 3,
 (('production', 'capacity', 'close'), 2): 1,
 (('capacity', 'close', '4.5'), 2):

In [12]:
train['News_3gram'][3207][0]

('company', 'supposed', 'deliver')

In [13]:
freqs.keys()



In [14]:
def extract_features(tweet):
    x = np.zeros((1, 4)) 
    
    x[0,0] = 1 
    ### START CODE HERE ###
    
    # loop through each word in the list of words
    for word in tweet:
        
        # increment the word count for the negative label 1
        x[0,1] += freqs.get((word,1),0)
        
        # increment the word count for the neutral label 0
        x[0,2] += freqs.get((word,2),0)

        # increment the word count for the positive label 0
        x[0,3] += freqs.get((word,3),0)

    return np.array([x[0,1],x[0,2],x[0,3]])

def token_splitter_pos(a):
    return a[2]

def token_splitter_neg(a):
    return a[0]

def token_splitter_neutral(a):
    return a[1]

In [15]:
train['News_tokenised'] = train['News_3gram'].apply(extract_features)

train['pos'] = train['News_tokenised'].apply(token_splitter_pos)
train['neg'] = train['News_tokenised'].apply(token_splitter_neg)
train['neu'] = train['News_tokenised'].apply(token_splitter_neutral)

In [16]:
train

Unnamed: 0,Sentiment,News Headline,News_1gram,News_2gram,News_3gram,News_tokenised,pos,neg,neu
3207,2,The company was supposed to deliver machinery ...,"[(company,), (supposed,), (deliver,), (machine...","[(company, supposed), (supposed, deliver), (de...","[(company, supposed, deliver), (supposed, deli...","[0.0, 7.0, 0.0]",0.0,0.0,7.0
1684,2,UNC Charlotte would also deploy SSH Tectia Con...,"[(unc,), (charlotte,), (would,), (also,), (dep...","[(unc, charlotte), (charlotte, would), (would,...","[(unc, charlotte, would), (charlotte, would, a...","[0.0, 10.0, 0.0]",0.0,0.0,10.0
1044,2,"In 2009 , Lee & Man had a combined annual prod...","[(2009,), (lee,), (man,), (combined,), (annual...","[(2009, lee), (lee, man), (man, combined), (co...","[(2009, lee, man), (lee, man, combined), (man,...","[0.0, 15.0, 1.0]",1.0,0.0,15.0
4145,2,`` That 's a very high figure on the European ...,"[(``,), ('s,), (high,), (figure,), (european,)...","[(``, 's), ('s, high), (high, figure), (figure...","[(``, 's, high), ('s, high, figure), (high, fi...","[0.0, 15.0, 0.0]",0.0,0.0,15.0
1538,2,"In Finland , the corresponding service is Alma...","[(finland,), (corresponding,), (service,), (al...","[(finland, corresponding), (corresponding, ser...","[(finland, corresponding, service), (correspon...","[1.0, 23.0, 0.0]",0.0,1.0,23.0
...,...,...,...,...,...,...,...,...,...
4772,1,YIT 's Baltic sales in the first three quarter...,"[(yit,), ('s,), (baltic,), (sales,), (first,),...","[(yit, 's), ('s, baltic), (baltic, sales), (sa...","[(yit, 's, baltic), ('s, baltic, sales), (balt...","[16.0, 0.0, 0.0]",0.0,16.0,0.0
4478,3,Tieto was looking for an energy solution which...,"[(tieto,), (looking,), (energy,), (solution,),...","[(tieto, looking), (looking, energy), (energy,...","[(tieto, looking, energy), (looking, energy, s...","[0.0, 0.0, 17.0]",17.0,0.0,0.0
53,3,Seppala 's revenue increased by 0.2 % to EUR10...,"[(seppala,), ('s,), (revenue,), (increased,), ...","[(seppala, 's), ('s, revenue), (revenue, incre...","[(seppala, 's, revenue), ('s, revenue, increas...","[0.0, 0.0, 26.0]",26.0,0.0,0.0
1796,2,"The company has the poser , who wants to impre...","[(company,), (poser,), (wants,), (impress,), (...","[(company, poser), (poser, wants), (wants, imp...","[(company, poser, wants), (poser, wants, impre...","[0.0, 5.0, 0.0]",0.0,0.0,5.0


Now you can use any classifier to fit [pos, neg, neu] with the target variable and get results