In [1]:
import csv
import pandas as pd

Mounted at /drive


In [2]:
tru_data = pd.read_csv('/Users/segurithvik/Downloads/tru_data.csv') # Reading csv files true data set

In [3]:
fake_data = pd.read_csv('/Users/segurithvik/Downloads/fake_data.csv') #Reading fake data set

In [4]:
print(tru_data['text'].isnull().sum()) # checking for null values
print(fake_data['text'].isnull().sum())

0
0


In [5]:
tru_data['text'] = tru_data['text'].str.lower() # converting all the characters in text column to lower case
fake_data['text'] = fake_data['text'].str.lower()

In [6]:
tru_data = tru_data.head(50) # selecting only first 50 rows for testing
fake_data = fake_data.head(50)

In [7]:
import nltk
nltk.download('punkt') # Tokenize each article into sentences
tru_data['sentences'] = tru_data['text'].apply(nltk.sent_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
fake_data['sentences'] = fake_data['text'].apply(nltk.sent_tokenize) # Tokenize each article into sentences

**Choosing a Data set for training purpose**
For my analysis I chose the sentence polarity that was taught in the class, which gives the sentiment polarity for each movie review as either positive or negative, the reason I chose this corpus was there are equal number of positive and negative words, which will not show any bias to positive or negative words in the sentence while predicting the polarity of words in a sentence in true and fake data set, for better accuracy. If our training data set is biased to either of the polarity we may not get good accuracy for our model, which will not be a good analysis.

In [9]:
# movie review sentences
import nltk
nltk.download('sentence_polarity')
from nltk.corpus import sentence_polarity
import random

sentences = sentence_polarity.sents() # getting sentence corpus
print(len(sentences))
print(sentence_polarity.categories())


pos_sents = sentence_polarity.sents(categories='pos')
print(len(pos_sents))

neg_sents = sentence_polarity.sents(categories='neg')
print(len(neg_sents))

[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/sentence_polarity.zip.


10662
['neg', 'pos']
5331
5331


In [10]:
docs = [(sent, cat) for cat in sentence_polarity.categories() #sent contains the list of words in the sentence and cat gives the label
	for sent in sentence_polarity.sents(categories=cat)]


print(docs[0])
print(docs[-1])

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'neg')
(['provides', 'a', 'porthole', 'into', 'that', 'noble', ',', 'trembling', 'incoherence', 'that', 'defines', 'us', 'all', '.'], 'pos')


In [11]:
random.shuffle(docs) # here we will be shuffing the the docs, which will ensure that we will not only get the first few pos or neg polarity words

all_words_list = [word for (sent,cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)

word_items = all_words.most_common(2000) # we will be limiting it to only commonly occured 2000 words
word_features = [word for (word,count) in word_items]
print(word_features[:50])

['.', 'the', ',', 'a', 'and', 'of', 'to', 'is', 'in', 'that', 'it', 'as', 'but', 'with', 'film', 'this', 'for', 'its', 'an', 'movie', "it's", 'be', 'on', 'you', 'not', 'by', 'about', 'one', 'more', 'like', 'has', 'are', 'at', 'from', 'than', '"', 'all', '--', 'his', 'have', 'so', 'if', 'or', 'story', 'i', 'too', 'just', 'who', 'into', 'what']


In [12]:
def doc_features(doc, word_features):
    doc_words = set(doc)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in doc_words)
    return features

featuresets = [(doc_features(d, word_features), c) for (d, c) in docs] # gives features for the document

featuresets[0]


({'V_.': True,
  'V_the': False,
  'V_,': True,
  'V_a': True,
  'V_and': False,
  'V_of': False,
  'V_to': False,
  'V_is': False,
  'V_in': True,
  'V_that': False,
  'V_it': False,
  'V_as': False,
  'V_but': False,
  'V_with': False,
  'V_film': False,
  'V_this': False,
  'V_for': False,
  'V_its': False,
  'V_an': False,
  'V_movie': False,
  "V_it's": False,
  'V_be': False,
  'V_on': False,
  'V_you': False,
  'V_not': False,
  'V_by': False,
  'V_about': False,
  'V_one': False,
  'V_more': False,
  'V_like': False,
  'V_has': False,
  'V_are': False,
  'V_at': False,
  'V_from': False,
  'V_than': False,
  'V_"': False,
  'V_all': True,
  'V_--': False,
  'V_his': False,
  'V_have': False,
  'V_so': False,
  'V_if': False,
  'V_or': False,
  'V_story': False,
  'V_i': False,
  'V_too': False,
  'V_just': False,
  'V_who': False,
  'V_into': False,
  'V_what': False,
  'V_most': False,
  'V_out': False,
  'V_no': False,
  'V_much': False,
  'V_even': False,
  'V_good': False,


**Model 1 - Classifier**
Here we will be training our first model, by using naive Bayes classifier, by just using the docs(which contains the list of words in the sentence and label to it) and word features. We will training the model with 3 fold cross validation, and finally we will be calculating the average precision, re-call and F-measure,in this case precision turned out to be 0.75, re-call and F- measure turned out to be 0.74

In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support

kf = KFold(n_splits=3, shuffle=True, random_state=35)  # 3-fold cross-validation

Precisions = []
Recalls = []
Fscores = []

X = [feat for feat, label in featuresets] # Splitting the featuresets into X (features) and y (labels)
y = [label for feat, label in featuresets]

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X): # splitting into train and test

    X_train = [X[i] for i in train_index]
    y_train = [y[i] for i in train_index]
    X_test = [X[i] for i in test_index]
    y_test = [y[i] for i in test_index]


    train_set = [(X_train[i], y_train[i]) for i in range(len(train_index))] # training with naive bayes classifier
    classifier = nltk.NaiveBayesClassifier.train(train_set)


    y_prediction = [classifier.classify(X_test[i]) for i in range(len(test_index))]


    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_prediction, average='binary', pos_label='pos') #calculating recall, fscore and precision
    Precisions.append(precision)
    Recalls.append(recall)
    Fscores.append(fscore)

average_precision = sum(Precisions) / len(Precisions)
average_recall = sum(Recalls) / len(Recalls)
average_fscore = sum(Fscores) / len(Fscores)

print(f"Average Precision: {average_precision:.2f}")
print(f"Average Recall: {average_recall:.2f}")
print(f"Average F-measure: {average_fscore:.2f}")


Average Precision: 0.75
Average Recall: 0.74
Average F-measure: 0.74


In [14]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english') #we will be getting stopwords
print(len(stopwords))
print(stopwords)


179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

# remove some negation words
negationwords.extend(['ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'])

In [16]:
newstopwords = [word for word in stopwords if word not in negationwords]
print(len(newstopwords))

157


In [17]:
new_all_words_list = [word for (sent,cat) in docs for word in sent if word not in newstopwords]

In [18]:
new_all_words = nltk.FreqDist(new_all_words_list)
new_word_items = new_all_words.most_common(2000)
new_word_features = [word for (word,count) in new_word_items]
print(new_word_features[:30])

['.', ',', 'film', 'movie', 'not', 'one', 'like', '"', '--', 'story', 'no', 'much', 'even', 'good', 'comedy', 'time', 'characters', 'little', 'way', 'funny', 'make', 'enough', 'never', 'makes', 'may', 'us', 'work', 'best', 'bad', 'director']


In [19]:
def doc_featuresA(document, new_word_features):
    doc_wordsA = set(document)
    featuresA = {}
    for word in new_word_features:
        featuresA['V_{}'.format(word)] = (word in doc_wordsA)
    return featuresA

featuresetsA = [(doc_featuresA(d, new_word_features), c) for (d, c) in docs] # get features sets for a document, including keyword features and category feature

featuresetsA[0]

({'V_.': True,
  'V_,': True,
  'V_film': False,
  'V_movie': False,
  'V_not': False,
  'V_one': False,
  'V_like': False,
  'V_"': False,
  'V_--': False,
  'V_story': False,
  'V_no': False,
  'V_much': False,
  'V_even': False,
  'V_good': False,
  'V_comedy': False,
  'V_time': False,
  'V_characters': False,
  'V_little': False,
  'V_way': False,
  'V_funny': False,
  'V_make': False,
  'V_enough': False,
  'V_never': False,
  'V_makes': False,
  'V_may': False,
  'V_us': False,
  'V_work': False,
  'V_best': False,
  'V_bad': False,
  'V_director': False,
  'V_)': False,
  'V_?': False,
  'V_(': False,
  'V_love': False,
  'V_would': False,
  'V_life': False,
  'V_:': False,
  "V_there's": False,
  'V_movies': False,
  'V_well': False,
  'V_new': False,
  'V_could': False,
  'V_something': False,
  'V_really': False,
  'V_made': False,
  'V_performances': False,
  'V_many': False,
  "V_that's": False,
  'V_drama': False,
  'V_look': False,
  'V_plot': False,
  'V_films': False,


**Model 2 - Classifier A**
Initially for our model 1(Classifier) we did not care about stopwords nor negation words, in our second model we will be removing the stop words and negation words to see how our model performs. Even in this case we will be using naive bayes classifier with 3 fold cross validation, this time we got the precision  as 0.74, and recall score as 0.72, fscore as 0.73 which is lesser than our model 1

In [20]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support

kf = KFold(n_splits=3, shuffle=True, random_state=35)

# Lists to store the precision, recall, and F-measure for each fold
PrecisionsA = []
RecallsA = []
FscoresA = []

# Split the featuresets into X (features) and y (labels)
Xa = [feat for feat, label in featuresetsA]
Ya = [label for feat, label in featuresetsA]

# Perform k-fold cross-validation
for train_index, test_index in kf.split(Xa):
    # Split data into training and testing sets for this fold
    X_trainA = [Xa[i] for i in train_index]
    Y_trainA = [Ya[i] for i in train_index]
    X_testA = [Xa[i] for i in test_index]
    Y_testA = [Ya[i] for i in test_index]

    # Train Naive Bayes classifier on training set of this fold
    train_setA = [(X_trainA[i], Y_trainA[i]) for i in range(len(train_index))]
    classifierA = nltk.NaiveBayesClassifier.train(train_setA)

    # Predict on test set of this fold
    Y_predictionA = [classifierA.classify(X_testA[i]) for i in range(len(test_index))]

    # Calculate precision, recall, and F-measure for this fold
    precisionA, recallA, fscoreA, _ = precision_recall_fscore_support(Y_testA, Y_predictionA, average='binary', pos_label='pos')
    PrecisionsA.append(precisionA)
    RecallsA.append(recallA)
    FscoresA.append(fscoreA)

# Calculate average precision, recall, and F-measure over all folds
average_precisionA = sum(PrecisionsA) / len(PrecisionsA)
average_recallA = sum(RecallsA) / len(RecallsA)
average_fscoreA = sum(FscoresA) / len(FscoresA)

print(f"Average Precision: {average_precisionA:.2f}")
print(f"Average Recall: {average_recallA:.2f}")
print(f"Average F-measure: {average_fscoreA:.2f}")


Average Precision: 0.74
Average Recall: 0.72
Average F-measure: 0.73


In [21]:
def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False

    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features
# define the feature sets
NOT_featuresets = [(NOT_features(d, word_features, negationwords), c) for (d, c) in docs]

print(NOT_featuresets[0][0]['V_NOTcare'])
print(NOT_featuresets[0][0]['V_always'])

False
False


**Model 3 - Classifier B**
In our last model i.e model 3(Classifier B), we are using the word features which we used in the model 1, plus the special negation feature, for our naive bayes classifier. We will be calculating the precision, recall and fscore by performing 3 fold cross validation, this time the score turned out to be 0.77 for all the three measures which is higher than the previous 2 models.

In [22]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support

kf = KFold(n_splits=3, shuffle=True, random_state=35)  # 3-fold cross-validation

# Lists to store the precision, recall, and F-measure for each fold
PrecisionsB = []
RecallsB = []
FscoresB = []

# Split the featuresets into X (features) and y (labels)
Xb = [feat for feat, label in NOT_featuresets]
Yb = [label for feat, label in NOT_featuresets]

# Perform k-fold cross-validation
for train_index, test_index in kf.split(Xb):
    # Split data into training and testing sets for this fold
    X_trainB = [Xb[i] for i in train_index]
    Y_trainB = [Yb[i] for i in train_index]
    X_testB = [Xb[i] for i in test_index]
    Y_testB = [Yb[i] for i in test_index]

    # Train Naive Bayes classifier on training set of this fold
    train_setB = [(X_trainB[i], Y_trainB[i]) for i in range(len(train_index))]
    classifierB = nltk.NaiveBayesClassifier.train(train_setB)

    # Predict on test set of this fold
    Y_predictionB = [classifierB.classify(X_testB[i]) for i in range(len(test_index))]

    # Calculate precision, recall, and F-measure for this fold
    precisionB, recallB, fscoreB, _ = precision_recall_fscore_support(Y_testB, Y_predictionB, average='binary', pos_label='pos')
    PrecisionsB.append(precisionB)
    RecallsB.append(recallB)
    FscoresB.append(fscoreB)

# Calculate average precision, recall, and F-measure over all folds
average_precisionB = sum(PrecisionsB) / len(PrecisionsB)
average_recallB = sum(RecallsB) / len(RecallsB)
average_fscoreB = sum(FscoresB) / len(FscoresB)

print(f"Average Precision: {average_precisionB:.2f}")
print(f"Average Recall: {average_recallB:.2f}")
print(f"Average F-measure: {average_fscoreB:.2f}")


Average Precision: 0.77
Average Recall: 0.77
Average F-measure: 0.77


# **Comparing the three models and their measures**
             
 Model 1 - Precision 0.75 Recall 0.74 Fscore 0.74,
 Model 2 - Precision 0.74 Recall 0.72 Fscore 0.73,
 Model 3 - Precision 0.77 Recall 0.77 Fscore 0.77
Precision measures of all the positive predictions made by the model, how many were actually correct.
Recall measures of all the actual positive instances, how many did the model correctly identify
F score it is simply the average between precision and recall.
Model three outperforms the other models in all the three measures, suggesting it to be a better model, which used feature words(which contains the list of words in the sentence and label to it) and special negation features. So we will be choosing model 3(Classifier B) for our prediction, since it has better measures than the remaining 2 models.

In [23]:
import nltk
nltk.download('punkt')
from collections import Counter


# Initialize lists to store the predicted sentiments and counts for each article
tru_data['sentence_sentiments'] = None
tru_data['pos_count'] = 0
tru_data['neg_count'] = 0

# Predict the sentiment for each sentence and count positive and negative sentiments for each article
for idx, row in tru_data.iterrows():
    article_sentiments = [classifierB.classify(doc_features(nltk.word_tokenize(sentence), word_features)) for sentence in row['sentences']]
    sentiment_counts = Counter(article_sentiments)

    tru_data.at[idx, 'sentence_sentiments'] = article_sentiments
    tru_data.at[idx, 'pos_count'] = sentiment_counts['pos']
    tru_data.at[idx, 'neg_count'] = sentiment_counts['neg']

# Display the results
tru_data[['text',  'pos_count', 'neg_count']].head(20)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,pos_count,neg_count
0,washington (reuters) - the head of a conservat...,10,17
1,washington (reuters) - transgender people will...,8,10
2,washington (reuters) - the special counsel inv...,7,10
3,washington (reuters) - trump campaign adviser ...,5,10
4,seattle/washington (reuters) - president donal...,15,20
5,"west palm beach, fla./washington (reuters) - t...",6,15
6,"west palm beach, fla (reuters) - president don...",10,15
7,the following statements were posted to the ve...,1,6
8,the following statements were posted to the ve...,3,5
9,washington (reuters) - alabama secretary of st...,1,1


In [24]:
tru_data.to_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/tru_data.csv',index=False)

In [25]:

# Initialize lists to store the predicted sentiments and counts for each article
fake_data['sentence_sentiments'] = None
fake_data['pos_count'] = 0
fake_data['neg_count'] = 0

# Predict the sentiment for each sentence and count positive and negative sentiments for each article
for idx, row in fake_data.iterrows():
    article_sentiments = [classifierB.classify(doc_features(nltk.word_tokenize(sentence), word_features)) for sentence in row['sentences']]
    sentiment_counts = Counter(article_sentiments)

    fake_data.at[idx, 'sentence_sentiments'] = article_sentiments
    fake_data.at[idx, 'pos_count'] = sentiment_counts['pos']
    fake_data.at[idx, 'neg_count'] = sentiment_counts['neg']

# Display the results
fake_data[['text', 'pos_count', 'neg_count']].head(20)

Unnamed: 0,text,pos_count,neg_count
0,donald trump just couldn t wish all americans ...,13,15
1,house intelligence committee chairman devin nu...,4,7
2,"on friday, it was revealed that former milwauk...",10,15
3,"on christmas day, donald trump announced that ...",4,11
4,pope francis used his annual christmas day mes...,9,10
5,the number of cases of cops brutalizing and ki...,4,8
6,donald trump spent a good portion of his day a...,9,6
7,in the wake of yet another court decision that...,4,3
8,many people have raised the alarm regarding th...,7,8
9,just when you might have thought we d get a br...,7,4


In [26]:
fake_data.to_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/fake_data.csv', index=False)

In [35]:
a = tru_data['pos_count'].sum() # no.of positive words in true data set
a

302

In [36]:
b = tru_data['neg_count'].sum() #no.of negative words in true data set
b

449

In [34]:
per_of_pos_tru = (a/(a+b))*100 #percentage of positive words in true data set
per_of_pos_tru

40.21304926764314

In [37]:
per_of_neg_tru = (b/(a+b))*100 #percentage of negative words in true data set
per_of_neg_tru

59.78695073235686

In [38]:
c = fake_data['pos_count'].sum() # no.of positive words in fake data set
c

299

In [39]:
d = fake_data['neg_count'].sum() # no.of negative words in fake data set
d

486

In [41]:
per_of_pos_fake = (c/(c+d))*100 #percentage of positive words in fake data set
per_of_pos_fake

38.089171974522294

In [42]:
per_of_neg_fake = (d/(c+d))*100 #percentage of negative words in fake data set
per_of_neg_fake

61.910828025477706

# **Comparing the results**
If we compare the percentage of postive words in true and fake data set, the true data set contains 40% of its total words as positive, where as the fake data set contains only 38% of its words to be positive. Now If we compare the percentage of negative words, true data set 60% of negative words whereas the fake data set 62% of negative words. Even if we compare the number of number words the trend continues to be the same. Which suggests us that the fake data set contains more negativity in it, when we compare this result with the results in the HW1, which also stated similar things while comparing the adjectives, which stated that there are more negative words in the fake data set. As both the data sets deal about the united states politics, the fake data set contains more negativity and contains social media profiles, URL's which spread negativity. The true data set is more formal, and uses more positive(polite) sentences. All in all there is more negativity in fake data set but not by a huge margin.

# **Discussion:**
From this assignment I learned to perfrom sentiment analysis for a csv file. The pre processing steps will not be same for all the models it differs from model to model, and also the data set provided. Choosing a good data set for training the data is very important you have to make sure the data set is not biased to one emotion and it should contain a label column in it stating the emotion. which should be categorical. This assignment helped me understand the sentiment analysis in a better way.Faced few challanges while doing it but eventually completed it and learned a lot about the sentiment analysis