In [1]:
import pandas as pd
import numpy as np

import re
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

import date_fixer

from googletrans import Translator

In [2]:
# Example: translate an Arabic sentence to English with googletrans.
translator = Translator()

translation = translator.translate("مرحبا كيف حالك؟",  dest='en')
translation.text

'Hello how are you?'

In [3]:
# Example: translate an English sentence to Arabic with googletrans.
translator = Translator()

translation = translator.translate("Hello, how are you?",  dest='ar')
translation.text

'مرحبًاّ! كيف حالك؟'

In [4]:
# Load daily8_3.csv and pass the frame through the project's date_fixer helper.
# NOTE(review): the saved output of this cell is
# "TypeError: 'NoneType' object is not subscriptable", so `new` may never be
# assigned on a fresh run — confirm what date_fixer.make_datetime expects/returns.
df = pd.read_csv('daily8_3.csv')
new = date_fixer.make_datetime(df)

TypeError: 'NoneType' object is not subscriptable

In [None]:
def encode_sentiment(df, columns_to_encode):
    """Return ``df`` with one-hot indicator columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    columns_to_encode : list of str
        Names of the columns to expand with ``pd.get_dummies``; every
        category is kept (``drop_first=False``).

    Returns
    -------
    pandas.DataFrame
        All original columns followed by the generated indicator columns.
    """
    indicator_columns = pd.get_dummies(df[columns_to_encode], drop_first=False)
    return pd.concat([df, indicator_columns], axis=1)

In [None]:
# Define `no_false` in this cell so it runs on a fresh kernel: keep only the rows
# whose text_label is not the string 'False'.
# NOTE(review): assumes the date-fixed frame `new` is the intended starting
# point — confirm against the original analysis.
no_false = new[new.text_label != 'False']
encoded = encode_sentiment(no_false, ['text_label'])
encoded.info()

In [None]:
# Drop rows labelled 'False', then tally the remaining sentiment labels.
encoded = encoded[encoded['text_label'] != 'False']
encoded['text_label'].value_counts()

In [None]:
# Distribution of the one-hot "negative" indicator.
encoded['text_label_negative'].value_counts()

In [None]:
# Overlaid histograms of `dateline` (50 bins), one colour per source.
fig, ax = plt.subplots(figsize=(18, 10))
sns.histplot(data=encoded, x='dateline', hue='source', bins=50, ax=ax)

In [None]:
# One histogram of `dateline` per source.
# displot is figure-level: it builds its own FacetGrid figure, so a preceding
# plt.figure(figsize=...) only leaves a stray empty figure and its size is
# ignored. Size the grid through height/aspect instead: 5 facets per row, each
# 6 in tall x 3.6 in wide, gives an 18 in wide grid (12 in tall when the
# sources fill two rows).
sns.displot(data=encoded, x='dateline', col='source', col_wrap=5, bins=50,
            height=6, aspect=0.6)

### Techreen
- Biggest spike in articles is in 2009.
        events:
        - Barack Obama inaugurated as US president in Jan 2009.
#### Sabanews
- Most articles written around 2012-2014.
#### SaudiYoum
- Most of the articles were written 2002-2004.
        Events: 
        - Iraq War
#### Youm7
- One small spike in articles in 2008 — Great Recession?
- Most articles written between 2012 - 2014 
            Events: 
            - ISIS?
#### Almasryalyoum
- 2008 - 2013
#### Ryiadh
- 2003 - 2009
#### Alittihad 
- 2008 - 2014
#### Almustaqbal
- 2004 - 2011
#### Alqabas
- 2006 - 2014
#### Echoroukonline
Not significant: very few articles, and only in 2013.

In [None]:
# Index the encoded frame by `dateline` and sort it; later cells resample on
# this index. Note that this rebinds `df`, which the load cell used for the raw
# CSV frame.
df = encoded.set_index('dateline').sort_index()

In [None]:
# Column names of the dateline-indexed frame.
df.columns

In [None]:
# (rows, columns) of the dateline-indexed frame.
df.shape

In [None]:
# Column dtypes and non-null counts. `info` must be called: a bare `df.info`
# only displays the bound method's repr.
df.info()

In [None]:
# Tally of headline sentiment labels.
df['headline_label'].value_counts()

In [None]:
# Tally of the values in the `tags` column.
df['tags'].value_counts()

In [None]:
# Tally of article-text sentiment labels.
df['text_label'].value_counts()

In [None]:
# Number of articles per source.
df['source'].value_counts()

In [None]:
# Bar chart of article counts per source, largest first.
sns.set_theme(style="white")
# Take the bar order from the same frame that is plotted, so the ordering always
# matches the displayed counts and the cell does not depend on the separate
# `new` frame.
source_order = df['source'].value_counts().index
ax = sns.countplot(x="source", data=df, palette="Set2", order=source_order)
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Source', size = 16)
ax.set_ylabel('Article Count', size = 16)
ax.set_title("Articles per website by count", size = 20)
plt.show()

Takeaways:
    
    - Huge spike in articles written around 2011. 
    - Most of the articles are written by the source Youm7, which is also the top source overall.
    - The other 2 highest times for articles written:
        - Right after 2012 - no known event for early to mid 2012?
        - Beginning of 2013 - Boston Marathon bombing?

In [None]:
def country_tagger(df):
    """Add a ``country`` column to ``df`` based on each row's ``source``.

    The frame is modified in place and also returned. Sources that are not
    listed below get a missing value in ``country``.
    """
    # Grouped by country for readability; flattened to source -> country below.
    sources_by_country = {
        'emirates': ('Alittihad',),
        'algeria': ('Echoroukonline',),
        'ksa': ('Ryiadh', 'SaudiYoum'),
        'syria': ('Techreen',),
        'kuwait': ('Alqabas',),
        'lebanon': ('Almustaqbal',),
        'egypt': ('Almasryalyoum', 'Youm7'),
        'yemen': ('Sabanews',),
    }
    source_to_country = {
        source: country
        for country, sources in sources_by_country.items()
        for source in sources
    }
    df['country'] = df.source.map(source_to_country)
    return df

In [None]:
# Bind the result explicitly instead of relying on country_tagger's in-place
# side effect, and preview a few rows rather than displaying the whole frame.
df = country_tagger(df)
df.head()

## What’s the relationship between article sentiment and world events?

- Hypothesis Testing not possible currently

In [None]:
# Monthly article counts per sentiment class, with world events marked as
# vertical lines.

# Indicator column -> legend label.
SENTIMENT_COLUMNS = {
    'text_label_negative': 'negative',
    'text_label_neutral': 'neutral',
    'text_label_positive': 'positive',
}

# (date, description) for every vertical marker; descriptions document the
# marker and are not drawn on the figure.
WORLD_EVENTS = [
    (dt.datetime(2001, 9, 11), '9/11'),
    (dt.datetime(2001, 10, 22), 'US leads assault on Taliban in Afghanistan'),
    (dt.datetime(2002, 10, 11), 'Congress authorizes force against Iraq'),
    (dt.datetime(2003, 3, 19), 'US invades Iraq, starting war'),
    (dt.datetime(2004, 2, 4), 'Facebook is formed'),
    (dt.datetime(2004, 11, 7), 'Start of the Second Battle of Fallujah'),
    (dt.datetime(2005, 2, 14), 'YouTube is created'),
    (dt.datetime(2005, 6, 23), 'Reddit is created'),
    (dt.datetime(2006, 2, 22), 'al-Askari Mosque bombing'),
    (dt.datetime(2006, 3, 21), 'Twitter is created'),
    (dt.datetime(2006, 12, 30), 'Saddam Hussein executed'),
    (dt.datetime(2007, 4, 16), 'Virginia Tech mass shooting'),
    (dt.datetime(2008, 9, 15), 'Start of Great Recession'),
    (dt.datetime(2009, 1, 20), 'Barack Obama inaugurated as first Black US president'),
    (dt.datetime(2010, 8, 30), 'US combat role in Iraq ends'),
    (dt.datetime(2011, 2, 14), 'Start of Arab Spring'),
    (dt.datetime(2011, 5, 2), 'Navy SEALs take down Osama bin Laden'),
    (dt.datetime(2011, 9, 19), 'Occupy Wall Street'),
    (dt.datetime(2012, 12, 14), 'Sandy Hook shooting'),
    (dt.datetime(2013, 4, 15), 'Boston Marathon bombing'),
]

# Resample once, and only the indicator columns: summing the full frame would
# also aggregate the text columns and repeat the work for every line.
# reset_index() exposes `dateline` as a regular column for seaborn.
monthly_sentiment = (
    df[list(SENTIMENT_COLUMNS)]
    .resample('1M')
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 8))
for column, label in SENTIMENT_COLUMNS.items():
    sns.lineplot(data=monthly_sentiment, x='dateline', y=column, label=label, ax=ax)
for event_date, _description in WORLD_EVENTS:
    ax.axvline(event_date, color='black')

ax.set_xlabel('dateline')
ax.set_ylabel('Articles per month')
ax.set_title('Monthly article sentiment and world events')
ax.legend()