In [None]:
import pandas as pd
# Load the tweets CSV, decoding the file as ISO-8859-1.
data = pd.read_csv("Tweets_Data.csv",encoding = "ISO-8859-1")

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Keep every column except the first one, selected by position.
# NOTE(review): presumably the first column is a saved row index — confirm.
# This cell is not idempotent: every re-run drops one more column.
data = data.iloc[:,1:]
data.head()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
import re #regular expression
import string

def clean_text(text):
    '''Normalise one piece of text for word counting.

    Steps, in order: lowercase; remove anything inside square brackets; remove ASCII
    punctuation; remove words that contain a digit; replace any remaining run of ASCII
    digits with a single space; remove curly quotes and the ellipsis character.

    Non-string input (for example ``None`` or a NaN coming from a missing cell) returns
    ``''`` instead of raising, so the function can be applied to a column with gaps.
    '''
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Raw strings keep the regex backslashes literal; '\[' or '\w' inside a normal string
    # literal is an invalid escape sequence that newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Same pattern as the former adjacent literals "[0-9" "]+", which concatenate to
    # "[0-9]+" (the space sat between the two literals, not inside the character class).
    text = re.sub(r'[0-9]+', ' ', text)
    text = re.sub('[‘’“”…]', '', text)
    return text

# Thin wrapper kept under the name `clean` so the cleaning step can be handed to pandas.
clean = lambda raw_text: clean_text(raw_text)

# Overwrite the Text column with its cleaned form.
data['Text'] = data['Text'].apply(clean)

# Wrap the cleaned frame as `data_df`, which the sentiment cells work on, and display it.
# NOTE(review): pd.DataFrame(data) does not deep-copy; whether the later stop-word removal
# applied to `data` is visible through `data_df` may depend on the pandas version — confirm
# which version of the text the sentiment cells are meant to score.
data_df = pd.DataFrame(data)
data_df

In [None]:
# Word frequency: the 20 most common whitespace-separated tokens over all texts.
all_words = ' '.join(data['Text']).split()
freq = pd.Series(all_words).value_counts().head(20)
freq

In [None]:
#removing stopwords
from nltk.corpus import stopwords
# Load the custom stop-word list into a set of lowercase words.
# `word in <DataFrame>` tests the column labels, not the cell values, so filtering against
# the frame returned by pd.read_csv never consulted the words stored in the file's rows.
# Splitting on commas as well as newlines accepts both one-word-per-line and
# comma-separated layouts.
with open('stop.txt', encoding='utf-8') as stop_file:
    stop = {
        token.strip().lower()
        for line in stop_file
        for token in line.split(',')
        if token.strip()
    }

# Rebuild each text from the words that are not stop words.
data['Text'] = data['Text'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop)
)

In [None]:
# Word frequency after stop-word removal: the 20 most common remaining tokens.
remaining_words = ' '.join(data['Text']).split()
freq_Sw = pd.Series(remaining_words).value_counts().head(20)
freq_Sw

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive (requires a Colab runtime).
# NOTE(review): the CSV and stop-word files in earlier cells are opened by relative path
# before this mount runs — confirm whether this cell is still needed or belongs first.
drive.mount('/content/drive')

In [None]:
# count vectoriser tells the frequency of a word.
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
vectorizer = CountVectorizer(min_df=1, max_df=0.9)
X = vectorizer.fit_transform(data["Text"])

# Column totals of the document-term matrix: how often each term occurs overall.
term_totals = np.asarray(X.sum(axis=0)).ravel().tolist()
word_freq_df = pd.DataFrame(
    {'term': vectorizer.get_feature_names_out(), 'occurrences': term_totals}
)

# Each term's share of all counted occurrences.
word_freq_df['frequency'] = word_freq_df['occurrences'] / word_freq_df['occurrences'].sum()

In [None]:
# First 30 rows of the term table, in the vectoriser's feature order (not sorted by count).
word_freq_df.head(30)

In [None]:
#TFIDF - Term frequency inverse Document Frequency
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser limited to at most 1000 terms (max_features).
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    max_df=0.5,
    smooth_idf=True,
)
doc_vec = vectorizer.fit_transform(data["Text"])
names_features = vectorizer.get_feature_names_out()

# Densify the sparse result into nested lists, then label the columns with the terms.
dense = doc_vec.todense()
denselist = dense.tolist()
df = pd.DataFrame(data=denselist, columns=names_features)

In [None]:
# Display the TF-IDF table: one row per text, one column per term.
df

In [None]:
#Bi-gram
def get_top_n2_words(corpus, n=None, ngram_range=(2, 2), max_features=2000):
    """Return the most frequent n-grams in ``corpus`` (bi-grams by default).

    Parameters
    ----------
    corpus : iterable of str
        Texts to count n-grams in.
    n : int or None, optional
        Number of entries to return; ``None`` returns every n-gram that was kept.
    ngram_range : tuple of (int, int), optional
        Passed to ``CountVectorizer``; the default ``(2, 2)`` counts bi-grams only.
    max_features : int or None, optional
        Vocabulary cap passed to ``CountVectorizer``.

    Returns
    -------
    list of (str, count)
        ``(n-gram, total count over the corpus)`` pairs, highest count first.
    """
    vec1 = CountVectorizer(ngram_range=ngram_range, max_features=max_features)
    # fit_transform learns the vocabulary and builds the count matrix in one pass instead
    # of processing the corpus once in fit() and a second time in transform().
    bag_of_words = vec1.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # column totals, indexed below as [0, idx]
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    # list.sort is stable, so ties keep their vocabulary order just as sorted() would.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# 200 most frequent bi-grams, tabulated with named columns.
top2_words = get_top_n2_words(data["Text"], n=200)
top2_df = pd.DataFrame(top2_words, columns=["Bi-gram", "Freq"])
top2_df.head()

In [None]:
# Display the bi-gram frequency table.
top2_df

In [None]:
#Tri-gram
def get_top_n3_words(corpus, n=None, ngram_range=(3, 3), max_features=2000):
    """Return the most frequent n-grams in ``corpus`` (tri-grams by default).

    Parameters
    ----------
    corpus : iterable of str
        Texts to count n-grams in.
    n : int or None, optional
        Number of entries to return; ``None`` returns every n-gram that was kept.
    ngram_range : tuple of (int, int), optional
        Passed to ``CountVectorizer``; the default ``(3, 3)`` counts tri-grams only.
    max_features : int or None, optional
        Vocabulary cap passed to ``CountVectorizer``.

    Returns
    -------
    list of (str, count)
        ``(n-gram, total count over the corpus)`` pairs, highest count first.
    """
    vec1 = CountVectorizer(ngram_range=ngram_range, max_features=max_features)
    # fit_transform learns the vocabulary and builds the count matrix in one pass instead
    # of processing the corpus once in fit() and a second time in transform().
    bag_of_words = vec1.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # column totals, indexed below as [0, idx]
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    # list.sort is stable, so ties keep their vocabulary order just as sorted() would.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# 200 most frequent tri-grams, tabulated with named columns.
top3_words = get_top_n3_words(data["Text"], n=200)
top3_df = pd.DataFrame(top3_words, columns=["Tri-gram", "Freq"])

In [None]:
# Display the tri-gram frequency table.
top3_df

In [None]:
#Sentiment analysis
# Load the AFINN lexicon; later cells read its `word` and `value` columns.
afinn = pd.read_csv('Afinn.csv', sep=',', encoding='latin-1')
# Only the last expression of a cell is displayed, so the shape is printed explicitly.
# NOTE(review): a bare `(2477, 2)` literal followed here, presumably pasted output from an
# earlier run — confirm the current file still has that shape.
print(afinn.shape)
afinn.head()

In [None]:
from itertools import islice

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [item for item in head]
# word -> score lookup built from the lexicon's `word` and `value` columns.
word_scores = afinn.set_index('word')['value']
affinity_scores = word_scores.to_dict()
# Peek at the first 20 entries.
take(20, affinity_scores.items())

In [None]:
import spacy
#Custom function :score each word in a sentence in lemmatised form,
#but calculate the score for the whole original sentence.
# Load spaCy's small English pipeline; calculate_sentiment reads each token's lemma from it.
nlp = spacy.load('en_core_web_sm')
# Expose the word -> score dict under the name calculate_sentiment looks up.
sentiment_lexicon = affinity_scores

def calculate_sentiment(data: str = None) -> float:
    """Sum the lexicon scores of the lemmas found in ``data``.

    The text is run through ``nlp``; each token's ``lemma_`` is looked up in
    ``sentiment_lexicon`` and lemmas missing from the lexicon count as 0.
    Empty or missing ``data`` yields 0.
    """
    if not data:
        return 0
    return sum(sentiment_lexicon.get(token.lemma_, 0) for token in nlp(data))
# Smoke test on a short phrase. Only the last expression of a cell is displayed, so the
# result is printed explicitly.
# NOTE(review): a bare `-2` literal followed this call, presumably the pasted result of an
# earlier run — confirm against the current lexicon.
print(calculate_sentiment(data='very sad'))

# Score every text and store the total in a new `sentiment_value` column.
data_df['sentiment_value'] = data_df['Text'].apply(calculate_sentiment)
data_df

In [None]:
# Number of whitespace-separated tokens in each text.
data_df['word_count'] = data_df['Text'].str.split().str.len()
data_df['word_count'].head(10)

In [None]:
# Ascending sort, so the last 10 rows are the ones with the highest sentiment_value.
data_df.sort_values(by='sentiment_value').tail(10)

In [None]:
# Summary statistics of the sentiment_value column over all rows
data_df['sentiment_value'].describe()

In [None]:
# First 10 rows whose sentiment_value is below zero
data_df[data_df['sentiment_value']<0].head(10)

In [None]:
# First 10 rows whose sentiment_value is above zero
data_df[data_df['sentiment_value']>0].head(10)

In [None]:
# First 10 rows whose sentiment_value is greater than 10.
data_df[data_df['sentiment_value']>10].head(10)

In [None]:
# Display the frame, including the sentiment_value and word_count columns added above.
data_df

In [None]:
# Turn each numeric score into a label:
#   < 0 -> "Negative", == 0 -> "Neutral", anything else -> "Positive".
# The labels overwrite the numeric `sentiment_value` column. The conversion is skipped when
# the column already holds only labels, because re-running this cell would otherwise
# compare a string with 0 and raise a TypeError.
SENTIMENT_LABELS = {"Negative", "Neutral", "Positive"}

if not set(data_df["sentiment_value"]) <= SENTIMENT_LABELS:
    result = [
        "Negative" if value < 0 else ("Neutral" if value == 0 else "Positive")
        for value in data_df["sentiment_value"]
    ]
    data_df["sentiment_value"] = result

print(data_df)

In [None]:
# Display the frame now that sentiment_value holds the Negative / Neutral / Positive labels.
data_df