<a id="1"></a>

<h1 style="background-color:#3c78aa;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Libraries And Utilities</h1>



In [None]:
import numpy as np 
import pandas as pd 
import re
import string
import nltk
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever spelling this install provides
# (falling back to the default style if neither is available).
plt.style.use(next(
    (name for name in ('seaborn-v0_8-ticks', 'seaborn-ticks') if name in plt.style.available),
    'default',
))
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
pyo.init_notebook_mode()
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud,STOPWORDS
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.cluster import DBSCAN
from sklearn.manifold import Isomap
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [None]:
# Read the posts export from the working directory and preview its first five rows
russian_data = pd.read_csv('russia_data.csv')
russian_data.head()

In [None]:
# Number of missing values in each column of the raw frame
russian_data.isna().sum()

In [None]:
def clean_title(text):
    """Normalise one raw post title for the text analysis below.

    Steps, in order: lower-case, drop @handles, drop URLs, keep only word
    characters, drop stand-alone single ASCII letters, collapse whitespace.

    Parameters
    ----------
    text : str
        Raw title.

    Returns
    -------
    str
        Cleaned title whose words are separated by single spaces.
    """
    text = text.lower()
    text = re.sub(r'@\S+', '', text)        # @handles
    text = re.sub(r'http\S+', '', text)     # URLs
    # Keep only runs of word characters, i.e. strip punctuation / special characters
    text = ' '.join(re.findall(r'\w+', text))
    # Lookarounds leave the surrounding spaces in place; consuming them (and
    # substituting '') would glue the neighbouring words together
    # ("this is a test" -> "this istest").
    text = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', '', text)
    # Collapse the gaps left behind by the removals above
    return re.sub(r'\s+', ' ', text).strip()


# Keep only the title and timestamp columns, drop rows missing either one,
# and switch to the short lower-case column names used from here on.
title_data = (
    russian_data[['Title', 'Date_utc']]
    .copy()
    .dropna()
    .rename(columns={'Title': 'title', 'Date_utc': 'timestamp'})
)

title_data['title'] = title_data['title'].map(clean_title)


In [None]:
# Preview the first five cleaned rows
title_data.head(5)


<a id="3.1"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Sentiment Feature Creation</h1>

In [None]:
# VADER analyser shared by every title
sid = SIA()


def _score_title(text):
    """Return VADER's polarity dict for `text` reduced to lower-case word tokens."""
    tokens = re.findall(r'\w+', text.lower())
    return sid.polarity_scores(' '.join(tokens))


# One polarity dict per title; held only until the three score columns exist
title_data['sentiments'] = title_data['title'].map(_score_title)

# Every score gets a 1e-6 offset added (presumably to keep values strictly
# positive for later steps -- confirm against downstream usage).
for column, key in (
    ('Positive Sentiment', 'pos'),
    ('Neutral Sentiment', 'neu'),
    ('Negative Sentiment', 'neg'),
):
    title_data[column] = title_data['sentiments'].map(
        lambda scores, key=key: scores[key] + 1*(10**-6)
    )

# The raw dicts are no longer needed once the numeric columns are in place
title_data.drop(columns=['sentiments'], inplace=True)

<a id="3.2"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Naive Feature Extraction</h1>

In [None]:
def _mean_content_word_length(text):
    """Return the mean length of the words in `text` that are not stop-words.

    Returns NaN when the title has no such words, without triggering NumPy's
    "mean of empty slice" warning.
    """
    lengths = [len(word) for word in text.split() if word not in STOPWORDS]
    return float(np.mean(lengths)) if lengths else np.nan


# str.split() with no argument yields [] for an empty title, so an empty title
# counts as 0 words rather than 1 (which is what split(' ') -> [''] produced).
title_data['# Of Words'] = title_data['title'].apply(lambda x: len(x.split()))

# Membership is tested against the STOPWORDS set directly; converting it to a
# list for every single word made each lookup linear in the stop-word count.
title_data['# Of StopWords'] = title_data['title'].apply(
    lambda x: sum(word in STOPWORDS for word in x.split())
)

title_data['Average Word Length'] = title_data['title'].apply(_mean_content_word_length)

<a id="4"></a>

<h1 style="background-color:#3c78aa;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Title Text Sentiment Analysis</h1>

In [None]:
# Overlaid histograms of the three sentiment score columns
fig, ax = plt.subplots(figsize=(10, 5))

for column, colour in (
    ('Positive Sentiment', 'blue'),
    ('Neutral Sentiment', 'orange'),
    ('Negative Sentiment', 'red'),
):
    ax.hist(title_data[column], bins=25, alpha=0.5, label=column, color=colour)

ax.set_title('Distribution Of Title Sentiments Across Posts', fontsize=19, fontweight='bold')
ax.set_xlabel('Sentiment Score', fontsize=15)
ax.set_ylabel('Frequency', fontsize=15)
ax.tick_params(labelsize=15)

# Font size is passed to this legend directly: updating rcParams after the
# legend is drawn has no effect on it and silently changes every later figure.
ax.legend(fontsize=20)
plt.show()

In [None]:
# Kernel density estimate of each sentiment score on shared axes
fig, ax = plt.subplots(figsize=(15, 5))

for column in ('Negative Sentiment', 'Positive Sentiment', 'Neutral Sentiment'):
    # label= ties each legend entry to its own curve instead of relying on draw order
    sns.kdeplot(title_data[column], bw_method=0.1, label=column, ax=ax)

ax.set_title('Distribution Of Sentiments Across Our Posts', fontsize=19, fontweight='bold')
# All three scores share this axis, so name it generically rather than after one column
ax.set_xlabel('Sentiment Score')

# Font size is passed to this legend directly: updating rcParams after the
# legend is drawn has no effect on it and silently changes every later figure.
ax.legend(fontsize=15)
plt.show()


In [None]:
# Cumulative (CDF) version of the sentiment density curves
fig, ax = plt.subplots(figsize=(10, 5))

for column in ('Negative Sentiment', 'Positive Sentiment', 'Neutral Sentiment'):
    # label= ties each legend entry to its own curve instead of relying on draw order
    sns.kdeplot(title_data[column], bw_method=0.1, cumulative=True, label=column, ax=ax)

ax.set_title('CDF Of Sentiments Across Our Posts', fontsize=19, fontweight='bold')
ax.set_xlabel('Sentiment Value', fontsize=19)

# Font size is passed to this legend directly: updating rcParams after the
# legend is drawn has no effect on it and silently changes every later figure.
ax.legend(fontsize=15)
plt.show()

**Observation**: The dominant sentiment among the Reddit post titles is, by far, neutral. Moreover, there is a probability of about 60% that a post title is classified as completely neutral.

<a id="4.2"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Title Text Decomposition Analysis</h1>


In [None]:
# Minimum score for a title to count as strongly positive / strongly negative
POSITIVE_THRESHOLD = 0.4
NEGATIVE_THRESHOLD = 0.25

# Only a lower bound is applied. The sentiment columns were stored with a 1e-6
# offset, so a closed upper bound of 1 (`between(x, 1)`) silently excluded the
# titles VADER scored as exactly 1.0 -- the most extreme ones.
Most_Positive = title_data.loc[title_data['Positive Sentiment'] >= POSITIVE_THRESHOLD, 'title']
Most_Negative = title_data.loc[title_data['Negative Sentiment'] >= NEGATIVE_THRESHOLD, 'title']

# WordCloud consumes a single string, so concatenate the selected titles
Most_Positive_text = ' '.join(Most_Positive)
Most_Negative_text = ' '.join(Most_Negative)

# collocations=False counts single words only, not two-word phrases
pwc = WordCloud(width=600, height=400, collocations=False, background_color='white').generate(Most_Positive_text)

fig, ax = plt.subplots()
ax.set_title('Common Words Among Most Positive Post Titles', fontsize=16, fontweight='bold')
ax.imshow(pwc)
ax.axis('off')
plt.show()

In [None]:
# Word cloud built from the concatenated most-negative titles (single words only)
negative_cloud_settings = dict(
    width=600,
    height=400,
    collocations=False,
    background_color='white',
)
nwc = WordCloud(**negative_cloud_settings).generate(Most_Negative_text)

# Single full-figure axes; the cloud is an image, so the axis frame is hidden
cloud_ax = plt.subplot(1, 1, 1)
cloud_ax.set_title('Common Words Among Most Negative Post Titles', fontsize=16, fontweight='bold')
cloud_ax.imshow(nwc)
cloud_ax.axis('off')
plt.show()

In [None]:
# Upper bound on the number of SVD components to extract
NUMBER_OF_COMPONENTS = 450
# Fixed seed: TruncatedSVD's default solver is randomized, so without it the
# components (and the explained-variance figures) change from run to run.
RANDOM_STATE = 42

# --- Text normalisation -----------------------------------------------------
# Drop stop-words and one-character tokens before stemming / lemmatising
text_data = title_data.title.copy()
text_data = text_data.apply(
    lambda x: ' '.join(word for word in x.split() if word not in STOPWORDS and len(word) > 1)
)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Two passes, kept in this order: stem every token, then lemmatise the stems
text_data = text_data.apply(lambda x: ' '.join(stemmer.stem(word) for word in word_tokenize(x)))
text_data = text_data.apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in word_tokenize(x)))

# --- Bag-of-words + truncated SVD --------------------------------------------
CVZ = CountVectorizer()
C_vector = CVZ.fit_transform(text_data)

# The randomized solver cannot extract more components than there are features
# (vocabulary terms), so cap the request for small vocabularies.
n_components = min(NUMBER_OF_COMPONENTS, C_vector.shape[1])
SVD = TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)
pc_matrix = SVD.fit_transform(C_vector)

evr = SVD.explained_variance_ratio_
total_var = evr.sum() * 100
cumsum_evr = np.cumsum(evr)

# --- Explained-variance plot --------------------------------------------------
trace1 = {
    "name": "individual explained variance",
    "type": "bar",
    "y": evr,
}
trace2 = {
    "name": "cumulative explained variance",
    "type": "scatter",
    "y": cumsum_evr,
}
data = [trace1, trace2]
layout = {
    "xaxis": {"title": "Principal components"},
    "yaxis": {"title": "Explained variance ratio"},
}
fig = go.Figure(data=data, layout=layout)
# The count reported here is the number of SVD components actually fitted, not words
fig.update_layout(
    title='{:.2f}% of the Post Text Variance Can Be Explained Using {} Components'.format(total_var, n_components)
)
fig.show()