In [None]:
# Import Libraries

from textblob import TextBlob
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import nltk
import re
import string
import seaborn as sns


from PIL import Image
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# One CSV per search term / hashtag in the tweet dataset.
file_paths = [
    "/ukraine-war-tweets-dataset-65-days/Russia_invade.csv",
    "/ukraine-war-tweets-dataset-65-days/Russian_border_Ukraine.csv",
    "/ukraine-war-tweets-dataset-65-days/Russian_troops.csv",
    "/ukraine-war-tweets-dataset-65-days/StandWithUkraine.csv",
    "/ukraine-war-tweets-dataset-65-days/Ukraine_border.csv",
    "/ukraine-war-tweets-dataset-65-days/Ukraine_nato.csv",
    "/ukraine-war-tweets-dataset-65-days/Ukraine_troops.csv",
    "/ukraine-war-tweets-dataset-65-days/Ukraine_war.csv",
]

# Read every file, then stack them into one frame with a fresh 0..n-1 index.
frames = [pd.read_csv(csv_path) for csv_path in file_paths]
tweets = pd.concat(frames, ignore_index=True)


In [None]:
# Drop fully duplicated rows (the same tweet can appear in several source files).
tweets = tweets.drop_duplicates()


In [None]:
# Keep only the first 10 characters of each timestamp string,
# which drops the time portion that follows the date.
tweets['date'] = tweets['date'].str[:10]


In [None]:
# Distinct dates covered by the dataset.
unique_dates = tweets['date'].unique()
print(unique_dates)

# Distinct language codes present in the dataset.
unique_langs = tweets["lang"].unique()
print(unique_langs)


In [None]:
# Shape before removing the non-English tweets.
print(tweets.shape)

# Keep only the English tweets. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning (they would otherwise write into a slice).
tweets = tweets[tweets['lang'] == 'en'].copy()
print("After removing non-english Tweets")

# Shape with only the English tweets left.
print(tweets.shape)


In [None]:
# Text-cleaning helpers: strip retweet prefixes, mentions, URLs and punctuation.
# Patterns are raw strings so that regex escapes such as \w and \S are not
# parsed as (invalid) Python string escapes.
def remove_rt(x):
    """Replace every retweet prefix of the form 'RT @user: ' with one space.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The text with each 'RT @<word chars>: ' occurrence replaced by ' '.
    """
    return re.sub(r'RT @\w+: ', " ", x)


def rt(x):
    """Replace mentions, URLs and non-alphanumeric characters with spaces.

    Each match of one of the following is replaced by a single space:
    an '@' followed by ASCII letters/digits, any character that is not an
    ASCII letter, digit, space or tab, or a 'scheme://...' URL up to the
    next whitespace.

    Parameters
    ----------
    x : str
        Tweet text (typically after ``remove_rt``).

    Returns
    -------
    str
        The cleaned text; original spaces and tabs are preserved.
    """
    return re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", x)

# Clean every tweet: drop the retweet prefix, then mentions/URLs/punctuation,
# and finally normalise to lower case.
tweets["content"] = (
    tweets["content"]
    .map(remove_rt)
    .map(rt)
    .str.lower()
)


In [None]:
# TextBlob scores: the two fields of `.sentiment` fill the two new columns in order.
tweets[['polarity', 'subjectivity']] = tweets['content'].apply(
    lambda text: pd.Series(TextBlob(text).sentiment))

# VADER scores. The analyzer is built once and reused for every tweet
# (the previous per-row construction repeated the setup work each iteration),
# and Series.iteritems(), removed in pandas 2.0, is no longer needed.
analyzer = SentimentIntensityAnalyzer()
vader_scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in tweets['content']],
    index=tweets.index,
    columns=['neg', 'neu', 'pos', 'compound'],
)

# Label each tweet by comparing its negative and positive scores;
# ties (including 0 vs 0) are labelled neutral.
sentiment = pd.Series("neutral", index=tweets.index)
sentiment[vader_scores['neg'] > vader_scores['pos']] = "negative"
sentiment[vader_scores['pos'] > vader_scores['neg']] = "positive"

# Write whole columns at once instead of one scalar .loc assignment per row.
tweets['sentiment'] = sentiment
tweets[['neg', 'neu', 'pos', 'compound']] = vader_scores


In [None]:
# Preview the cleaned text next to its sentiment label and scores.
tweets.head(5)[["content", "sentiment", "polarity",
                "subjectivity", "neg", "neu", "pos"]]


In [None]:
# Count the tweets carrying each sentiment label.
sentiment_labels = tweets['sentiment']
total_pos = len(tweets[sentiment_labels == "positive"])
total_neg = len(tweets[sentiment_labels == "negative"])
total_neu = len(tweets[sentiment_labels == "neutral"])
total_tweets = len(tweets)

# Report each label's share of all tweets as a percentage.
for template, count in (
    ("Total Positive Tweets % : {:.2f}", total_pos),
    ("Total Negative Tweets % : {:.2f}", total_neg),
    ("Total Neutral Tweets % : {:.2f}", total_neu),
):
    print(template.format((count/total_tweets)*100))


In [None]:
# Pie chart of the overall sentiment split; the negative slice is pulled out.
plt.figure(figsize=(8, 5), dpi=600)

mylabels = ["Positive", "Negative", "Neutral"]
mycolors = ["Green", "Red", "Blue"]
myexplode = [0, 0.2, 0]
slice_sizes = [total_pos, total_neg, total_neu]

plt.pie(slice_sizes, explode=myexplode, labels=mylabels, colors=mycolors)
plt.show()


In [None]:
# Daily share of each sentiment label, as a percentage of that day's tweets.
# Line colours are keyed by label; the dict order is also the plotting order.
sentiment_colors = {"negative": "red", "positive": "green", "neutral": "blue"}

# One row per date, one column per label. crosstab skips rows with a missing
# date (the manual per-date filter divided by zero on them), and reindex
# guarantees all three columns exist even if a label never occurs.
daily_pct = (
    pd.crosstab(tweets["date"], tweets["sentiment"], normalize="index")
    .reindex(columns=list(sentiment_colors), fill_value=0.0)
    .sort_index()
    * 100
)
dates = daily_pct.index.tolist()

# Names kept for backward compatibility with cells that may still read them.
# Historical naming: x_cord_* hold the percentages, y_cord_* hold the dates,
# and *_list hold (percentage, date) tuples sorted by date.
x_cord_neg, y_cord_neg = daily_pct["negative"].tolist(), list(dates)
x_cord_pos, y_cord_pos = daily_pct["positive"].tolist(), list(dates)
x_cord_neu, y_cord_neu = daily_pct["neutral"].tolist(), list(dates)
neg_list = list(zip(x_cord_neg, y_cord_neg))
pos_list = list(zip(x_cord_pos, y_cord_pos))
neu_list = list(zip(x_cord_neu, y_cord_neu))

fig, ax = plt.subplots(figsize=(16, 9), dpi=600)
for label, color in sentiment_colors.items():
    ax.plot(dates, daily_pct[label].to_numpy(), label=label, color=color)

# Dates are plotted as categories at positions 0..n-1; label every 5th one.
# The stop is len(dates), not len(dates) + 1, so no tick lands past the last date.
ax.set_xticks(np.arange(0, len(dates), 5))
ax.tick_params(axis="x", labelrotation=90)
ax.grid(axis='x')

ax.set_xlabel("Date")
ax.set_ylabel("Share of tweets (%)")
ax.set_title("Daily sentiment share")
ax.legend()
plt.show()
