In [None]:
!pip install -q nlu

In [None]:
!pip install -q transformers
!pip install -q sentencepiece

In [None]:
pip install googletrans==4.0.0-rc1

In [None]:
import pandas as pd
from googletrans import Translator
import re
from textblob import TextBlob
import seaborn as sns
%matplotlib inline
from transformers import pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS
from collections import defaultdict

In [None]:
# Load the article dataset from a CSV file located next to the notebook.
DATA_PATH = 'NLP_Project - Sheet9.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the DataFrame for a visual check.
df

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Single googletrans client, shared by every translate_to_english call.
translator = Translator()

def translate_to_english(row):
    """Return the English text of one article row.

    Parameters
    ----------
    row : mapping with 'Language' and 'Article' keys
        Typically a DataFrame row passed in by ``df.apply(..., axis=1)``.

    Returns
    -------
    The translated text when the row's language is Marathi, Telugu or Tamil;
    otherwise the article is returned unchanged. An article that is missing
    or not a non-empty string is also returned unchanged instead of being
    sent to the translator.
    """
    # Language name as stored in the 'Language' column -> source language code.
    source_codes = {'Marathi': 'mr', 'Telugu': 'te', 'Tamil': 'ta'}

    article = row['Article']
    src = source_codes.get(row['Language'])

    # Nothing to translate: language not in the table, or no usable text.
    if src is None or not isinstance(article, str) or not article.strip():
        return article

    return translator.translate(article, src=src, dest='en').text

# Translate row by row (axis=1 hands each row to the function) and keep the
# result in a new column.
translated_articles = df.apply(translate_to_english, axis=1)
df['Translated_Article'] = translated_articles

In [None]:
# Display the DataFrame for a visual check.
df

In [None]:
def clean_text(text):
    """Normalize a piece of text for downstream analysis.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or NaN from a missing
        cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, or '' for non-string input.
    """
    # Missing cells arrive as None/NaN, which have no .lower().
    if not isinstance(text, str):
        return ''

    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Clean every translated article and overwrite the column with the result.
cleaned_articles = df['Translated_Article'].apply(clean_text)
df['Translated_Article'] = cleaned_articles
df

In [None]:
# Rows whose City is Mumbai.
is_mumbai = df['City'] == 'Mumbai'
df_mum = df[is_mumbai]
df_mum

In [None]:
def show_wordcloud(data, title = None):
    """Draw a word cloud for a collection of texts.

    Parameters
    ----------
    data : iterable of str
        Texts to visualise; they are joined with single spaces into one
        corpus before the cloud is generated.
    title : str, optional
        Figure title. When omitted (or empty) no title is drawn.
    """
    # Library stop words plus corpus-specific terms that would otherwise
    # dominate every cloud.
    excluded_words = set(STOPWORDS) | {
        "mumbai", "India", "said", "year", "say", 'maharashtra', 'state',
        'climate', 'change', 'says', 'city', 'chennai', 'hyderabad', 'days',
        'cities', 'telangana',
    }

    corpus = " ".join(data)
    cloud = WordCloud(stopwords=excluded_words, background_color='black').generate(corpus)

    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        # NOTE(review): top=2.3 lies outside the usual 0-1 figure fraction;
        # looks like a layout hack for the title — confirm it is intended.
        fig.subplots_adjust(top=2.3)

    plt.imshow(cloud)
    plt.show()

In [None]:
# Word cloud of the translated articles from Mumbai.
mumbai_articles = df_mum["Translated_Article"].values
show_wordcloud(mumbai_articles)

In [None]:
# Rows whose City is Chennai.
is_chennai = df['City'] == 'Chennai'
df_Chen = df[is_chennai]
df_Chen

In [None]:
# Word cloud of the translated articles from Chennai.
chennai_articles = df_Chen["Translated_Article"].values
show_wordcloud(chennai_articles)

In [None]:
# Rows whose City is Hyderabad.
is_hyderabad = df['City'] == 'Hyderabad'
df_hyd = df[is_hyderabad]
df_hyd

In [None]:
# Word cloud of the translated articles from Hyderabad.
hyderabad_articles = df_hyd["Translated_Article"].values
show_wordcloud(hyderabad_articles)

In [None]:
# TextBlob sentiment of every translated article: one polarity and one
# subjectivity value per row, stored as two new columns.
sentiments = [TextBlob(article).sentiment for article in df['Translated_Article']]
polarity = [sentiment.polarity for sentiment in sentiments]
subjectivity = [sentiment.subjectivity for sentiment in sentiments]
df['Polarity'] = polarity
df['Subjectivity'] = subjectivity


In [None]:
# Display the DataFrame for a visual check.
df

In [None]:
# Distribution of article polarity per city, one box per city.
fig, ax = plt.subplots(figsize=(5, 6))
sns.boxplot(data=df, x="City", y="Polarity", hue="City", ax=ax)
ax.set(title='Polarity across Cities')
plt.show()



In [None]:
# Distribution of article subjectivity per city, one box per city.
fig, ax = plt.subplots(figsize=(5, 6))
sns.boxplot(data=df, x="City", y="Subjectivity", hue="City", ax=ax)
ax.set(title='Subjectivity across Cities')
plt.show()

In [None]:
# Zero-shot classifier reused by every labelling cell below.
# The checkpoint is named explicitly instead of relying on the library's
# implicit default for this task, so results stay reproducible if that
# default ever changes.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [None]:
# Candidate labels. The order positive / negative / neutral is chosen to match
# the positional column names (Positive, Negative, Neutral) applied when these
# scores are turned into a DataFrame.
labels = ['positive', 'negative', 'neutral']
posNeuNeg = defaultdict(list)

cities = ['Mumbai', 'Hyderabad', 'Chennai']

for city in cities:
    # All translated articles of the city, joined into one text.
    # NOTE(review): this text is probably longer than the model's maximum
    # input length — confirm whether the pipeline truncates it, in which case
    # only the beginning of the concatenation is actually scored.
    sequence = " ".join(df.loc[df['City'] == city, 'Translated_Article'])
    result = classifier(sequence, labels)
    # The pipeline returns `labels` sorted by descending score with `scores`
    # in that same sorted order, so map every score back to its label and
    # store the scores in the fixed order of `labels`.
    score_by_label = dict(zip(result['labels'], result['scores']))
    posNeuNeg[city].append([score_by_label[label] for label in labels])




In [None]:
# Inspect the raw per-city score lists.
posNeuNeg

In [None]:
# One row per city, built from the first (only) score list stored per city.
# NOTE(review): the column names are positional — they are only correct if
# each stored score list is ordered positive, negative, neutral; verify
# against the cell that fills posNeuNeg.
sentiment_by_city = {city: runs[0] for city, runs in posNeuNeg.items()}
pnn = pd.DataFrame.from_dict(
    sentiment_by_city,
    orient='index',
    columns=['Positive', 'Negative', 'Neutral'],
)

In [None]:
# Display the sentiment score table.
pnn

In [None]:
# Grouped bar chart: one group per city, one bar per sentiment column.
ax = pnn.plot(kind='bar', rot=0, figsize=(10, 6), colormap='viridis')
ax.set_title('Positive, Negative and Neutral Scores by City')
ax.set_ylabel('Sentiment Score')
ax.set_xlabel('City')
plt.show()

In [None]:
# Candidate emotion labels; scores are stored per city in exactly this order.
labels = ['angry', 'fear', 'happy', 'sad', 'surprise']
emotions = defaultdict(list)


for city in cities:
    # All translated articles of the city, joined into one text.
    # NOTE(review): this text is probably longer than the model's maximum
    # input length — confirm whether the pipeline truncates it.
    sequence = " ".join(df.loc[df['City'] == city, 'Translated_Article'])
    result = classifier(sequence, labels)
    # The pipeline returns `labels` sorted by descending score with `scores`
    # in that same sorted order, so map every score back to its label and
    # store the scores in the fixed order of `labels`.
    score_by_label = dict(zip(result['labels'], result['scores']))
    emotions[city].append([score_by_label[label] for label in labels])



In [None]:
# Inspect the raw per-city score lists.
emotions

In [None]:
# One row per city, built from the first (only) score list stored per city.
# NOTE(review): column names are positional — verify the stored score lists
# follow this same label order.
emotion_by_city = {city: runs[0] for city, runs in emotions.items()}
emo = pd.DataFrame.from_dict(
    emotion_by_city,
    orient='index',
    columns=['angry', 'fear', 'happy', 'sad', 'surprise'],
)

In [None]:
# Display the emotion score table.
emo

In [None]:
# Grouped bar chart: one group per city, one bar per emotion column.
ax = emo.plot(kind='bar', rot=0, figsize=(10, 6), colormap='viridis')
ax.set_title('Emotion Scores by City')
ax.set_ylabel('Sentiment Score')
ax.set_xlabel('City')
plt.show()

In [None]:
# Candidate intent labels; scores are stored per city in exactly this order.
labels = ['Optimism', 'Injustice', 'disgust']
intent = defaultdict(list)


for city in cities:
    # All translated articles of the city, joined into one text.
    # NOTE(review): this text is probably longer than the model's maximum
    # input length — confirm whether the pipeline truncates it.
    sequence = " ".join(df.loc[df['City'] == city, 'Translated_Article'])
    result = classifier(sequence, labels)
    # The pipeline returns `labels` sorted by descending score with `scores`
    # in that same sorted order, so map every score back to its label and
    # store the scores in the fixed order of `labels`.
    score_by_label = dict(zip(result['labels'], result['scores']))
    intent[city].append([score_by_label[label] for label in labels])


In [None]:
# One row per city, built from the first (only) score list stored per city;
# columns are named positionally after the current `labels` list.
intent_by_city = {city: runs[0] for city, runs in intent.items()}
intt = pd.DataFrame.from_dict(intent_by_city, orient='index', columns=labels)

In [None]:
# Display the intent score table.
intt

In [None]:
# Grouped bar chart: one group per city, one bar per intent column.
ax = intt.plot(kind='bar', rot=0, figsize=(10, 6), colormap='viridis')
ax.set_title('Intent Scores by City')
ax.set_ylabel('Sentiment Score')
ax.set_xlabel('City')
plt.show()

In [None]:

# Candidate aspect labels; scores are stored per city in exactly this order.
# NOTE(review): 'Adaption' may be meant as 'Adaptation' — the label text is
# what the classifier scores against, so confirm the intended wording.
labels = ['Cause', 'Impact', 'Mitigation', 'Adaption']
aspect = defaultdict(list)


for city in cities:
    # All translated articles of the city, joined into one text.
    # NOTE(review): this text is probably longer than the model's maximum
    # input length — confirm whether the pipeline truncates it.
    sequence = " ".join(df.loc[df['City'] == city, 'Translated_Article'])
    result = classifier(sequence, labels)
    # The pipeline returns `labels` sorted by descending score with `scores`
    # in that same sorted order, so map every score back to its label and
    # store the scores in the fixed order of `labels`.
    score_by_label = dict(zip(result['labels'], result['scores']))
    aspect[city].append([score_by_label[label] for label in labels])

In [None]:
# One row per city, built from the first (only) score list stored per city;
# columns are named positionally after the current `labels` list.
aspect_by_city = {city: runs[0] for city, runs in aspect.items()}
asp = pd.DataFrame.from_dict(aspect_by_city, orient='index', columns=labels)

In [None]:
# Display the aspect score table.
asp

In [None]:
# Grouped bar chart: one group per city, one bar per aspect column.
ax = asp.plot(kind='bar', rot=0, figsize=(10, 6), colormap='viridis')
ax.set_title('Aspect Scores by City')
ax.set_ylabel('Sentiment Score')
ax.set_xlabel('City')
plt.show()

In [None]:
# Candidate awareness labels; scores are stored per city in exactly this order.
labels = ['Informed', 'Uninformed', 'Misinformed']
awarness = defaultdict(list)


for city in cities:
    # All translated articles of the city, joined into one text.
    # NOTE(review): this text is probably longer than the model's maximum
    # input length — confirm whether the pipeline truncates it.
    sequence = " ".join(df.loc[df['City'] == city, 'Translated_Article'])
    result = classifier(sequence, labels)
    # The pipeline returns `labels` sorted by descending score with `scores`
    # in that same sorted order, so map every score back to its label and
    # store the scores in the fixed order of `labels`.
    score_by_label = dict(zip(result['labels'], result['scores']))
    awarness[city].append([score_by_label[label] for label in labels])

In [None]:
# One row per city, built from the first (only) score list stored per city;
# columns are named positionally after the current `labels` list.
awareness_by_city = {city: runs[0] for city, runs in awarness.items()}
awar = pd.DataFrame.from_dict(awareness_by_city, orient='index', columns=labels)

In [None]:
# Grouped bar chart: one group per city, one bar per awareness column.
awar.plot(kind='bar', rot=0, figsize=(10, 6), colormap='viridis')
# Title spelling corrected from 'Awarness'.
plt.title('Awareness Scores by City')
plt.ylabel('Sentiment Score')
plt.xlabel('City')
plt.show()

In [None]:
# Candidate ideology labels; scores are stored per city in exactly this order.
labels = ['Liberal', 'Conservative']
ideology = defaultdict(list)


for city in cities:
    # All translated articles of the city, joined into one text.
    # NOTE(review): this text is probably longer than the model's maximum
    # input length — confirm whether the pipeline truncates it.
    sequence = " ".join(df.loc[df['City'] == city, 'Translated_Article'])
    result = classifier(sequence, labels)
    # The pipeline returns `labels` sorted by descending score with `scores`
    # in that same sorted order, so map every score back to its label and
    # store the scores in the fixed order of `labels`.
    score_by_label = dict(zip(result['labels'], result['scores']))
    ideology[city].append([score_by_label[label] for label in labels])

In [None]:
# Inspect the raw per-city score lists.
ideology

In [None]:
# One row per city, built from the first (only) score list stored per city;
# columns are named positionally after the current `labels` list.
ideology_by_city = {city: runs[0] for city, runs in ideology.items()}
idg = pd.DataFrame.from_dict(ideology_by_city, orient='index', columns=labels)

In [None]:
# Display the ideology score table.
idg

In [None]:
# Grouped bar chart: one group per city, one bar per ideology column.
idg.plot(kind='bar', rot=0, figsize=(10, 6), colormap='viridis')
# The title previously read 'Sentiment Scores by City' (copied from another
# chart); this figure shows the ideology scores.
plt.title('Ideology Scores by City')
plt.ylabel('Sentiment Score')
plt.xlabel('City')
plt.show()

In [None]:
# 2x2 overview figure: one bar-chart panel per score table.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
panels = [
    (pnn, 'PosNegNeu'),
    (emo, 'Emotions'),
    (intt, 'Intent'),
    (asp, 'Aspect'),
]
for panel_ax, (score_table, panel_title) in zip(axes.flat, panels):
    score_table.plot(ax=panel_ax, kind='bar', rot=0, colormap='viridis')
    panel_ax.set_title(panel_title)

plt.tight_layout()

plt.show()

In [None]:
# Side-by-side figure (1 row, 2 columns): awareness on the left, ideology on
# the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,7))

panels = [
    (awar, 'Awareness'),
    (idg, 'Ideology'),
]
for panel_ax, (score_table, panel_title) in zip(axes, panels):
    score_table.plot(ax=panel_ax, kind='bar', rot=0, colormap='viridis')
    panel_ax.set_title(panel_title)

plt.tight_layout()

plt.show()

# Non-parametric test (Kruskal-Wallis H test)

In [None]:
# Put the six score tables side by side (axis=1), then transpose the result.
score_tables = [pnn, emo, intt, asp, awar, idg]
combined_scores = pd.concat(score_tables, axis=1)
Final = combined_scores.T

In [None]:
# Display the combined score table.
Final

In [None]:
# Summary statistics for each column of the combined table.
Final.describe()

In [None]:
import scipy.stats as stats

# Kruskal-Wallis H test over the three city columns of `Final`.
# NOTE(review): presumably every column mixes scores from several separate
# label sets — confirm this is a meaningful sample for the test.
city_samples = [Final['Mumbai'], Final['Hyderabad'], Final['Chennai']]
statistic, p_value = stats.kruskal(*city_samples)

# Report the test statistic and its p-value.
print("Kruskal-Wallis H Test:")
print("H-statistic =", statistic)
print("P-value =", p_value)

# Compare the p-value with the 5% significance level.
alpha = 0.05
is_significant = p_value < alpha
print(
    "The differences between groups are statistically significant."
    if is_significant
    else "No significant differences between groups were found."
)
