In [11]:
import pandas as pd
import numpy as np
import nltk
import import_ipynb
import SerbStemmer
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import imblearn
import statsmodels.api as sm
import wordcloud
import sklearn
from Serbstemmer import stem_str

from sklearn.externals import joblib
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler

from statsmodels.formula.api import ols
from scipy.stats import chi2_contingency
from scipy.stats import shapiro
from scipy.stats import ranksums
from matplotlib.offsetbox import AnchoredText


from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


# Print floats in plain decimal notation rather than scientific notation.
np.set_printoptions(suppress=True)
# Fetch NLTK's 'punkt' resource; the captured output below shows it is a no-op when already installed.
nltk.download('punkt')
# Tokenizer that keeps runs of word characters (\w+) and therefore discards punctuation.
tokenizer = RegexpTokenizer(r'\w+')

importing Jupyter notebook from Serbstemmer.ipynb


[nltk_data] Downloading package punkt to /Users/ognjand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Below we combine the coder responses with the original data set and the automated features and prepare the data for machine learning.

In [2]:
# Coder responses; connect_frames() below filters this frame on 'Q1'/'Q3' and reads columns 'Q2' and 'A'..'G'.
train_pd= pd.read_csv("coders.csv", encoding = 'utf-8')
# Stop-word table; its "words" column is turned into a plain list further down in this cell.
stopwords = pd.read_csv("stopwords.csv")

def connect_frames(name, tag):
    """Join one coder's automated features with that coder's manual frame codes.

    Parameters
    ----------
    name : str
        Prefix of the feature file; '<name>_features.csv' is read.
    tag : str
        Value of column 'Q1' in the global ``train_pd`` selecting this coder's rows.

    Returns
    -------
    pandas.DataFrame
        Every row of the feature file, left-joined on 'Q2' with columns
        'A'..'G' of the matching 'Machine set' responses. Feature rows with
        no matching response keep NaN in 'A'..'G'.
    """
    features = pd.read_csv('{}_features.csv'.format(name), sep=',', encoding='utf-8')
    # The join key on the feature side is the row position within the feature file.
    features['Q2'] = features.index

    is_coded = (train_pd['Q3'] == 'Machine set') & (train_pd['Q1'] == '{}'.format(tag))
    # .copy() so that casting 'Q2' below never writes into a slice of train_pd.
    coded = train_pd.loc[is_coded, ['Q2', 'A', 'B', 'C', 'D', 'E', 'F', 'G']].copy()
    coded['Q2'] = coded['Q2'].apply(int)

    return pd.merge(features, coded, how='left', on='Q2')
#create all feature data set from all
tags = ['OD', 'TD', 'SD', 'DD']
ognjan = connect_frames('ognjan', 'OD')
teo = connect_frames('teo', 'TD')
spela = connect_frames('spela', 'SD')
damjan = connect_frames('damjan', 'DD')
train = pd.concat([ognjan, teo, spela, damjan], axis=0, ignore_index=True)
train = train[['text', 'country-source', 'Q2', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
               'article_len', 'ratio_not_stop_w', 'avg_word_len', 'title_len',
               'ratio_unique_words', 'ner']]
# Keep only coded articles; .copy() so the column assignments below act on an independent frame.
train = train.dropna(subset=['A']).copy()
train['ner'] = train['ner'].str.lower()
# Recode every coder answer column from 'Yes'/other to 1/0.
for frame_col in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
    train[frame_col] = train[frame_col].eq('Yes').mul(1)

#text pre processing
stopwords = stopwords["words"].tolist()
stopword_set = set(stopwords)  # constant-time membership tests
text = []
for article in train['text'].tolist():
    # Text up to the first period. split() keeps the whole text when there is
    # no period, whereas s[0:s.find('.')] silently dropped the last character.
    first_sentence = article.split('.', 1)[0]
    tokens = [tok.lower() for tok in tokenizer.tokenize(first_sentence)]
    kept = [tok for tok in tokens if tok not in stopword_set]
    text.append(stem_str(" ".join(kept)))  #check this stemmer again

In [4]:
# Number of coded training articles per value of 'country-source'.
train['country-source'].value_counts()

USA    641
RU     467
Name: country-source, dtype: int64

In [11]:
# Percentage of articles with / without frame 'G' within each country source.
# normalize expects a boolean; the string 'true' only worked because it is truthy.
train.groupby('country-source')['G'].value_counts(normalize=True).mul(100)

country-source  G
RU              0    87.794433
                1    12.205567
USA             0    98.283931
                1     1.716069
Name: G, dtype: float64

# Variance Analysis of all features

For the binary categorical data (the frames), we use an approach to Chi-Square analysis borrowed from InsightBot (http://www.insightsbot.com/blog/2AeuRL/chi-square-feature-selection-in-python).

# Frame variance analysis

In [None]:
class ChiSquare:
    """Chi-square test of independence between two columns of a DataFrame.

    After ``TestIndependence`` has run, ``p``, ``chi2``, ``dof``,
    ``dfObserved`` and ``dfExpected`` hold the result of the most recent test.
    """

    def __init__(self, dataframe):
        self.df = dataframe
        self.p = None  # p-value of the last test
        self.chi2 = None  # chi-square test statistic of the last test
        self.dof = None  # degrees of freedom of the last test

        self.dfObserved = None  # observed contingency table (rows: colY, columns: colX)
        self.dfExpected = None  # expected frequencies, same shape and labels as dfObserved

    def _print_chisquare_result(self, colX, alpha):
        """Print whether ``colX`` is significant at level ``alpha`` given ``self.p``."""
        if self.p < alpha:
            result = "{0} is IMPORTANT for Prediction".format(colX)
        else:
            result = "{0} is NOT an important predictor. (Discard {0} from model)".format(colX)

        print(result)

    def TestIndependence(self, colX, colY, alpha=0.05):
        """Test whether ``colX`` and ``colY`` are independent and print the verdict.

        Both columns are cast to strings and cross-tabulated; the test is run
        on that contingency table and the results are stored on the instance.

        Parameters
        ----------
        colX : str
            Candidate predictor column.
        colY : str
            Target column.
        alpha : float, default 0.05
            Significance level used for the printed verdict.
        """
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)

        self.dfObserved = pd.crosstab(Y, X)
        # The original called stats.chi2_contingency, but no name `stats` is
        # bound in this notebook; chi2_contingency itself is imported at the top.
        statistic, p, dof, expected = chi2_contingency(self.dfObserved.values)
        self.p = p
        self.chi2 = statistic
        self.dof = dof

        self.dfExpected = pd.DataFrame(expected,
                                       columns=self.dfObserved.columns,
                                       index=self.dfObserved.index)

        self._print_chisquare_result(colX, alpha)

# The independence tests run on the coded training frame.
df = train
cT = ChiSquare(df)

# Test each of these frame columns against the article's country source.
testColumns = ['D', 'E', 'F', 'G']
for var in testColumns:
    cT.TestIndependence(var, "country-source")

## Automated feature variance analysis 
article_len, ratio_not_stop_w, avg_word_len, title_len, ratio_unique_words

First, we divide the data according to source. 
Second, we test assumptions to determine the necessary statistical procedure for variance analysis.

As the assumptions of normality are not met, we use a non-parametric test for the variance analysis. Because the two samples are of different lengths, and hence not paired, we use the Wilcoxon rank-sum test.

In [None]:
AUTOMATED_FEATURES = ['article_len', 'ratio_not_stop_w', 'avg_word_len', 'title_len', 'ratio_unique_words']
alpha = 0.05

# Split by source. reset_index() returns a new frame (keeping the old index
# as an 'index' column) instead of mutating a filtered frame in place.
usa = train[train['country-source'] == 'USA'].reset_index()
rus = train[train['country-source'] == 'RU'].reset_index()

# Shapiro-Wilk normality test on each feature over the pooled training data.
for feature in AUTOMATED_FEATURES:
    stat, p = shapiro(train[feature])
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    if p > alpha:
        print('Feature', feature, 'sample looks Gaussian (fail to reject H0)')
    else:
        print('Feature', feature, 'sample does not look Gaussian (reject H0)')

# Wilcoxon rank-sum test of each feature between the two sources.
for feature in AUTOMATED_FEATURES:
    stat, p = ranksums(usa[feature], rus[feature])
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    if p > alpha:
        print('Feature', feature, 'does not vary significantly (fail to reject H0 and DROP)')
    else:
        print('Feature', feature, 'does vary significantly (reject H0 and KEEP)')
        

# Developing the classifiers for frame prediction

## Classifiers which can be tuned for hyperparameters
Classifier help function obtained from http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

In [12]:
from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and tabulate the per-split test scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    params : dict
        Maps the same names to the parameter grid handed to GridSearchCV.

    Raises
    ------
    ValueError
        If a name in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = sorted(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=5, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every registered model.

        All keyword arguments are forwarded to GridSearchCV unchanged. The
        fitted searches are stored in ``self.grid_searches`` under the model name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter set) with split-score statistics.

        Parameters
        ----------
        sort_by : str, default 'mean_score'
            Column to sort by, in descending order.

        Returns
        -------
        pandas.DataFrame
            Columns 'estimator', 'min_score', 'mean_score', 'max_score' and
            'std_score' first, followed by one column per tuned parameter.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid searches to summarise; call fit() first.")

        rows = []
        for name, gs in self.grid_searches.items():
            print(name)
            cv_results = gs.cv_results_
            param_sets = cv_results['params']
            # Prefer the fitted split count so that a splitter object or
            # cv=None also works; `cv` is only meaningful here when it is an int.
            n_splits = getattr(gs, 'n_splits_', gs.cv)
            # Shape (n_parameter_sets, n_splits): one row of split scores per parameter set.
            per_split = np.column_stack(
                [cv_results["split{}_test_score".format(i)] for i in range(n_splits)]
            )
            for param_set, scores in zip(param_sets, per_split):
                stats_row = {
                    'estimator': name,
                    'min_score': min(scores),
                    'max_score': max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                }
                rows.append(pd.Series({**param_set, **stats_row}))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

# Candidate estimators for the frame classifiers, keyed by the name that
# EstimatorSelectionHelper prints and reports in its summary.
models1 = {
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

# Parameter grid per estimator; every key of models1 must also appear here,
# otherwise EstimatorSelectionHelper raises ValueError. The SVC entry is a
# list of two grids (linear and rbf kernels).
params1 = {
    'RandomForestClassifier': { 'n_estimators': [16, 1000] },
    'AdaBoostClassifier':  { 'n_estimators': [16, 600] },
    'GradientBoostingClassifier': { 'n_estimators': [5, 16], 'learning_rate': [0.8, 1.0] },
    'SVC' : [{'kernel': ['linear'], 'C': [1, 10, 100]},  {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.001, 0.0001]} ]
}


# Create data set for frame classifier development

In [None]:
# Label matrix with one column per issue frame. Selecting by name instead of
# by position (6..9) raises a clear KeyError if the frame layout ever changes,
# rather than silently picking the wrong columns.
FRAME_COLUMNS = ['D', 'E', 'F', 'G']
issue_frames = [train[col].tolist() for col in FRAME_COLUMNS]
issue_labels = np.array(issue_frames).T

xi_train, xi_test, yi_train, yi_test = train_test_split(text, issue_labels, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(max_features=1500, min_df=1, max_df=150)
xi_train = vectorizer.fit_transform(xi_train)
xi_test = vectorizer.transform(xi_test)

#here we take the SMOTE sample
# NOTE: this rebinds `sm`, which the import cell bound to statsmodels.api; the
# name is kept because later cells call sm.fit_sample. Seeded for reproducibility.
sm = SMOTEENN(random_state=42)
#here we select which frame the analysis is run for
# NOTE(review): newer imbalanced-learn releases call this method fit_resample -
# confirm against the installed version.
X_res, y_res = sm.fit_sample(xi_train, yi_train[:, 0])

#use the helper function defined above to test a number of models and hyperparameters
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_res, y_res, scoring='f1_weighted', n_jobs=2)
helper1.score_summary(sort_by='mean_score')

# Classifiers without hyperparameters

In [None]:
#manually input frame of interest
X_res, y_res = sm.fit_sample(xi_train, yi_train[:, 0])
# The vectorizer output is sparse and GaussianNB only accepts dense input, so
# it gets a dense copy; the other estimators keep the sparse matrix.
X_res_dense = X_res.toarray() if hasattr(X_res, 'toarray') else X_res

classifiers = [LogisticRegression(), MultinomialNB(), GaussianNB()]
for clas in classifiers:
    X_input = X_res_dense if isinstance(clas, GaussianNB) else X_res
    all_accuracies = cross_val_score(estimator=clas, X=X_input, y=y_res, cv=5, scoring='f1_weighted')
    # Mean weighted F1 over the 5 folds.
    print(np.mean(all_accuracies))

# Test the best model for each frame with a number of parameters options for the Count and Tfidf Vectorizer (min and max features and df)

In [None]:
import warnings
import itertools
warnings.filterwarnings('ignore')

frame_idx = 3        # column of issue_labels to model (frame of interest, manually input)
f1_threshold = 0.6   # configurations scoring at or below this weighted F1 are discarded

# Candidate values for max_features, min_df and max_df (in that order).
features = [[1000, 1500, 2500, 4000, 5000, 8000, 10000, 12000, 15000, 20000],
            [1, 5, 10, 50, 100, 200],
            [50, 100, 150, 250, 300, 450, 500, 1000]]
combinations = list(itertools.product(*features))
vectorizers = [CountVectorizer, TfidfVectorizer]

# Split once: with a fixed random_state every split is identical, so the raw
# lists are kept untouched and the vectorised matrices get their own names.
xi_train, xi_test, yi_train, yi_test = train_test_split(text, issue_labels, test_size=0.2, random_state=42)
actual = yi_test[:, frame_idx]

models = []
f1 = []
vects = []
i = 1
for e in combinations:
    for vectorize in vectorizers:
        if e[2] > e[1]: #if max_df > min_df
            try:
                vectorizer = vectorize(max_features=e[0], min_df=e[1], max_df=e[2])
                xi_train_vec = vectorizer.fit_transform(xi_train)
                xi_test_vec = vectorizer.transform(xi_test)
                #Here we input the best classifier for that frame based on the cell above
                classifier = LogisticRegression()
                #here we again create the smote sample and the frame of interest
                sm = SMOTEENN(random_state=42)
                # The original also called sm.fit_sample(X, y) here; X and y
                # are not defined at this point and the bare except hid the
                # resulting error, so no configuration was ever evaluated.
                X_res, y_res = sm.fit_sample(xi_train_vec, yi_train[:, frame_idx])
                classifier.fit(X_res, y_res)
                predictions = classifier.predict(xi_test_vec)
                score = metrics.f1_score(actual, predictions, average='weighted')
                if score > f1_threshold:
                    f1.append(score)
                    vects.append((vectorize, e[0], e[1], e[2]))
            except Exception as err:
                # Still best-effort (some df/feature combinations are invalid
                # for this corpus), but the reason is reported, not swallowed.
                print('skipped', vectorize.__name__, e, '-', err)
        i += 1
        print(i)

#save the best model
if f1:
    ints = f1.index(max(f1))
    model = [vects[ints], max(f1)]
    models.append(model)
else:
    print('No configuration exceeded the F1 threshold of', f1_threshold)
#save the best model

# Applying the best model and saving vectorizers and trained classifiers

In [None]:
#Apply the best model and report precision and recall rates, as well as f1 from here
#The figures presented here are thus the outcome of the best model and hyperparameter combination for that frame
#Then that model is combined with the best vectorizer option

#The frame of interest is manually input - once, so that the label column and
#the saved file names can no longer disagree (previously frame 1 was trained
#but written to the issue_3_* files).
frame_idx = 1

# Seeded so that the reported figures can be reproduced.
xi_train, xi_test, yi_train, yi_test = train_test_split(text, issue_labels, test_size=0.20, random_state=42)
vectorizer = TfidfVectorizer(max_features=15000, min_df=1, max_df=150)
xi_train = vectorizer.fit_transform(xi_train)
xi_test = vectorizer.transform(xi_test)

#a smote sample is once again taken
sm = SMOTEENN(random_state=42)
X_res, y_res = sm.fit_sample(xi_train, yi_train[:, frame_idx])
actual = yi_test[:, frame_idx]

smote = RandomForestClassifier(n_estimators=16).fit(X_res, y_res)
smote_pred = smote.predict(xi_test)
print(classification_report(actual, smote_pred))

# Instantiate the classifier here instead of relying on whatever object an
# earlier cell left in `classifier`; swap in the best estimator for this frame.
classifier = LogisticRegression()
classifier.fit(X_res, y_res)
predictions = classifier.predict(xi_test)
# classification_report expects (y_true, y_pred); the arguments were swapped.
print(classification_report(actual, predictions))

#save the vectorizer and classifier for each frame
joblib.dump(vectorizer, 'issue_{}_vec.pkl'.format(frame_idx))
joblib.dump(classifier, 'issue_{}_clas.pkl'.format(frame_idx))

# Predict frames in entire data set to conduct the country classification

In [None]:
# Output column names, in the same order as the saved issue_0..issue_3 models.
FRAME_NAMES = ['serbian_vic', 'anti_west', 'pro_russian', 'russian_might']

m = pd.read_csv("complete_dataset.csv")
test = pd.read_csv('test_features.csv')
stopwords = pd.read_csv('stopwords.csv')
stopwords = stopwords["words"].tolist()
stopword_set = set(stopwords)  # constant-time membership tests

all_text = []
for article in test['text'].tolist():
    # Text up to the first period. split() keeps the whole text when there is
    # no period, whereas s[0:s.find('.')] silently dropped the last character.
    first_sentence = article.split('.', 1)[0]
    tokens = [tok.lower() for tok in tokenizer.tokenize(first_sentence)]
    kept = [tok for tok in tokens if tok not in stopword_set]
    all_text.append(stem_str(" ".join(kept)))  #check this stemmer again

# NOTE(review): isin() with a DataFrame compares cell by cell on matching
# index and column labels, and predicting_set is not used below - confirm
# whether an anti-join on an article key was intended.
predicting_set = m[~m.isin(train)].dropna()

models = [('issue_{}_clas.pkl'.format(k), 'issue_{}_vec.pkl'.format(k)) for k in range(len(FRAME_NAMES))]
predictions = []

for model, vec in models:
    loaded_model = joblib.load('{}'.format(model))
    loaded_vec = joblib.load('{}'.format(vec))
    count = loaded_vec.transform(all_text)
    prediction = loaded_model.predict(count)
    predictions.append(prediction)

serbian_vic, anti_west, pro_russian, russian_might = (pd.DataFrame(p) for p in predictions)
# Assign the prediction arrays directly; rows are in the same order as `test`.
for frame_name, prediction in zip(FRAME_NAMES, predictions):
    test[frame_name] = prediction

# rename() and drop(errors='ignore') are no-ops on a second run, so this cell
# can be re-executed (del train['C'] raised KeyError the second time).
train = train.rename(columns=dict(zip(['D', 'E', 'F', 'G'], FRAME_NAMES)))
train = train.drop(columns=['C'], errors='ignore')
# DataFrame.append was removed in pandas 2.0; concat is the equivalent.
alldata = pd.concat([train, test], ignore_index=True, sort=False)
alldata.to_csv("new_complete")

# Country source classification

In [None]:
import re

all_data = pd.read_csv('new_complete')

#preparing the Named Entity Recognition COLUMN
featuress = alldata['ner'].tolist()
clean_ner = []
for x in featuress:
    if isinstance(x, str):
        # Keep only word characters and spaces.
        clean_ner.append(re.sub(r'[^\w ]', '', x))
    else:
        # Non-string entries (e.g. missing values) get the same placeholder
        # the original bare `except` produced.
        clean_ner.append("string")

loaded_vec = joblib.load('issue_0_vec.pkl')
count = loaded_vec.transform(clean_ner)
count = count.toarray()


# several feature selection analyses are conducted first as outlined in the study
all_data_m = alldata.rename(columns={'country-source': 'source'})
all_data_m['source'] = all_data_m['source'].eq('USA').mul(1)  # 1 = USA, 0 = otherwise
y = np.array(all_data_m['source'].values)

# The coder letters 'D'..'G' no longer exist in alldata (they were renamed to
# the frame names when alldata was built), so the former list raised KeyError.
# NOTE(review): 'B' was also listed before; it is presumably only available
# for manually coded rows and is left out here - confirm. This list now
# matches the one used by the country grid-search cell.
features_n = ['article_len', 'ratio_not_stop_w', 'avg_word_len', 'title_len', 'ratio_unique_words',
              'serbian_vic', 'anti_west', 'pro_russian', 'russian_might']
features = all_data_m[features_n]
X = np.array(features.values)
# train_test_split returns (X_train, X_test, y_train, y_test); the names were swapped.
xg_train, xg_test, yg_train, yg_test = train_test_split(X, y, test_size=0.2, random_state=42)

#univariate feature selection
# Named kbest so that the `test` DataFrame is not overwritten.
kbest = SelectKBest(score_func=chi2, k=4)
fit = kbest.fit(X, y)
np.set_printoptions(precision=3)
print(fit.scores_)
#random trees feature selection
model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)
#Recursive Feature Selection
model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(X, y)
# Printed explicitly: as bare expressions in the middle of a cell these were never displayed.
print(fit.n_features_)
print(fit.support_)
print(fit.ranking_)
#Variance threshold feature selection
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
m = sel.fit_transform(X)


#outputs generated from these analyses are manually input in the feature list two cells below - 'features_n'

In [None]:
from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and tabulate the per-split test scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    params : dict
        Maps the same names to the parameter grid handed to GridSearchCV.

    Raises
    ------
    ValueError
        If a name in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = sorted(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=5, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every registered model.

        All keyword arguments are forwarded to GridSearchCV unchanged. The
        fitted searches are stored in ``self.grid_searches`` under the model name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter set) with split-score statistics.

        Parameters
        ----------
        sort_by : str, default 'mean_score'
            Column to sort by, in descending order.

        Returns
        -------
        pandas.DataFrame
            Columns 'estimator', 'min_score', 'mean_score', 'max_score' and
            'std_score' first, followed by one column per tuned parameter.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid searches to summarise; call fit() first.")

        rows = []
        for name, gs in self.grid_searches.items():
            print(name)
            cv_results = gs.cv_results_
            param_sets = cv_results['params']
            # Prefer the fitted split count so that a splitter object or
            # cv=None also works; `cv` is only meaningful here when it is an int.
            n_splits = getattr(gs, 'n_splits_', gs.cv)
            # Shape (n_parameter_sets, n_splits): one row of split scores per parameter set.
            per_split = np.column_stack(
                [cv_results["split{}_test_score".format(i)] for i in range(n_splits)]
            )
            for param_set, scores in zip(param_sets, per_split):
                stats_row = {
                    'estimator': name,
                    'min_score': min(scores),
                    'max_score': max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                }
                rows.append(pd.Series({**param_set, **stats_row}))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

# SGD and k-nearest-neighbours are only used for the country task, so they
# are imported here; the original referenced the undefined names SGD and
# KNeighboursClassifier.
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate estimators for the country-source classifier. Every key must
# match a key in params1, otherwise EstimatorSelectionHelper raises ValueError
# (the misspelt 'KNeigboursClassifier' key did exactly that).
models1 = {
    'SGD': SGDClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC(),
}

# Parameter grid per estimator. The SVC entry is a list of two grids; the
# comma between them was missing, which made the whole cell a syntax error.
params1 = {
    'SGD': {'alpha': [0.01, 0.001, 0.0001]},
    'KNeighborsClassifier': {'n_neighbors': [16, 300, 500, 1000], 'leaf_size': [30, 50, 100, 200]},
    'RandomForestClassifier': {'n_estimators': [16, 300, 500, 1000], 'max_depth': [10, 50, 100, 1000]},
    'AdaBoostClassifier': {'n_estimators': [16, 300, 600], 'learning_rate': [0.6, 1.0]},
    'GradientBoostingClassifier': {'n_estimators': [16, 300, 500], 'learning_rate': [0.6, 0.8, 1.0]},
    'SVC': [
        {'kernel': ['linear'], 'C': [1, 10, 100]},
        {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.01, 0.001, 0.0001]},
    ],
}


# Evaluating model and hyperparameter combinations for each feature set combination

In [None]:
all_data_m = alldata
y = np.array(all_data_m['country-source'].values)
#feature combinations manually input
features_n = ['article_len', 'ratio_not_stop_w', 'avg_word_len', 'title_len', 'ratio_unique_words',
              'serbian_vic', 'anti_west', 'pro_russian', 'russian_might']
# Hand-picked features followed by the vectorised NER columns in `count`.
X = np.column_stack((np.array(all_data_m[features_n].values), count))
xg_train, xg_test, yg_train, yg_test = train_test_split(X, y, test_size=0.2, random_state=42)
#classifier manually input
#NOTE THAT SMOTE+ENN IS NOT APPLIED FOR THE COUNTRY CLASSIFICATION TASK
helper1 = EstimatorSelectionHelper(models1, params1)
# Fit on the country training split. The original passed X_res / y_res, i.e.
# the resampled frame-classification data left over from earlier cells.
helper1.fit(xg_train, yg_train, scoring='f1_weighted', n_jobs=2)
helper1.score_summary(sort_by='mean_score')

# Models without hyperparameters

In [None]:
# Cross-validate on the country training split. The original resampled with
# sm.fit_sample(xg_train, xg_train[:,0]), which used the first feature column
# as the label; per the note in the grid-search cell, SMOTE+ENN is not applied
# for the country task, so no resampling is done here.
classifiers = [LogisticRegression(), MultinomialNB(), GaussianNB()]
for clas in classifiers:
    all_accuracies = cross_val_score(estimator=clas, X=xg_train, y=yg_train, cv=5, scoring='f1_weighted')
    # Mean weighted F1 over the 5 folds.
    print(np.mean(all_accuracies))

# Data Visualization

# Figures 1 and 2 - frame distribution

In [None]:
#DATA WRANGLING - VALUES MANUALLY INPUT
# Select the figure to draw. Previously both data sets were assigned in
# sequence, so the figure-2 values silently overwrote the figure-1 values.
FIGURE = 2

if FIGURE == 1:
    #for figure 1
    frame_counts = {"Serbian victimhood": [66, 60],
                    "Anti-West": [34, 81],
                    "Pro-Russian": [14, 39],
                    "Russian might": [11, 57]}
    totals = {'USA': 641, 'RUS': 467}
else:
    #for figure 2
    frame_counts = {"Serbian victimhood": [205, 304],
                    "Anti-West": [80, 107],
                    "Pro-Russian": [188, 426],
                    "Russian might": [110, 376]}
    totals = {'USA': 5860, 'RUS': 5860}

df = pd.DataFrame(frame_counts).T
df = df.reset_index()
df.columns = ['frame', 'USA', 'RUS']

dflong = df.melt(id_vars='frame').rename({"value": "yes", "variable": "country"}, axis=1)
# Articles without the frame = country total minus articles with the frame.
dflong['no'] = dflong['country'].map(totals) - dflong['yes']

# Expand the counts back into one row per article (value 1 = frame present,
# 0 = absent) so that seaborn can compute the confidence interval of the share.
df_reconstructed1 = dflong.loc[dflong.index.repeat(dflong.yes), ['frame', 'country']].copy()
df_reconstructed1['value'] = 1
df_reconstructed2 = dflong.loc[dflong.index.repeat(dflong.no), ['frame', 'country']].copy()
df_reconstructed2['value'] = 0
df_reconstructed = pd.concat([df_reconstructed1, df_reconstructed2])


#Visualising
frame_order = df['frame'].tolist()
country_order = ['USA', 'RUS']

sns.set(palette="gray_r", font_scale=1.1)
plt.figure(figsize=(10, 6))
myplot = sns.barplot(data=df_reconstructed, y='value', x='frame', hue='country',
                     order=frame_order, hue_order=country_order, ci=95)
# Number of articles with the frame, keyed by (frame, country).
absolute_values = df_reconstructed.groupby(['frame', 'country'])['value'].sum()

# Label each bar by its (frame, country) key. The original indexed
# absolute_values by bar position, but groupby sorts its keys alphabetically,
# which is not the order in which the bars are drawn.
# NOTE(review): assumes myplot.containers holds one group of bars per hue
# level, in hue_order - confirm against the installed seaborn version.
for country, bars in zip(country_order, myplot.containers):
    for frame, p in zip(frame_order, bars):
        anchor = (p.get_x() + p.get_width() / 2., p.get_height())
        myplot.annotate(format(p.get_height(), '.1%'),
                        anchor,
                        ha='center',
                        xytext=(-23, 20), textcoords='offset points')
        myplot.annotate(f"n={absolute_values[(frame, country)]}",
                        anchor,
                        ha='center',
                        xytext=(-23, 5), textcoords='offset points')

# Named y_range so that the label vector `y` used by the modelling cells is not overwritten.
y_range = np.array([0, 0.30])
plt.yticks(np.arange(y_range.min(), y_range.max(), 0.05))
plt.grid(axis='y', linestyle='-')

# Automated feature variation - Figure 3 

In [None]:
from scipy.stats import zscore

AUTOMATED_COLUMNS = ['article_len', 'title_len', 'ratio_not_stop_w', 'ratio_not_stop_w_t',
                     'avg_word_len', 'ratio_unique_words']

# Read the file once (it was read three times with identical arguments).
all_data = pd.read_csv('complete_dataset.csv', sep=',')
all_data = all_data[all_data['article_len'] < 10000]#drop one outlier of 10,000

# .copy() so that the rescaling below writes into an independent frame.
automated = all_data[['country-source'] + AUTOMATED_COLUMNS].copy()
# Scale every feature by its maximum, then standardise it to z-scores.
automated[AUTOMATED_COLUMNS] = automated[AUTOMATED_COLUMNS] / automated[AUTOMATED_COLUMNS].max()
automated[AUTOMATED_COLUMNS] = automated[AUTOMATED_COLUMNS].apply(zscore)

# One list of values per feature and source, in AUTOMATED_COLUMNS order.
is_usa = automated['country-source'] == 'USA'
is_rus = automated['country-source'] == 'RU'
usa = [automated.loc[is_usa, col].tolist() for col in AUTOMATED_COLUMNS]
rus = [automated.loc[is_rus, col].tolist() for col in AUTOMATED_COLUMNS]

# Tick labels, in the same order as AUTOMATED_COLUMNS ('Avgerage' typo fixed).
ticks = ['Article length', 'Title length', 'Ratio substantive words - text',
         'Ratio substantive words - title', 'Average word length', 'Ratio unique words']

def set_box_color(bp, color):
    """Apply one colour to the boxes, whiskers, caps and medians of a boxplot.

    ``bp`` is indexed by the keys 'boxes', 'whiskers', 'caps' and 'medians';
    ``color`` is passed to ``plt.setp`` as the ``color`` property.
    """
    for part in ('boxes', 'whiskers', 'caps', 'medians'):
        plt.setp(bp[part], color=color)

    
plt.figure()

# Two boxes per feature around every even x position: USA shifted left, RUS
# shifted right. sym='' hides the outlier markers.
usa_positions = np.arange(len(usa)) * 2.0 - 0.4
rus_positions = np.arange(len(rus)) * 2.0 + 0.4
bpl = plt.boxplot(usa, positions=usa_positions, sym='', widths=0.6)
bpr = plt.boxplot(rus, positions=rus_positions, sym='', widths=0.6)

set_box_color(bpl, '#D7191C')
set_box_color(bpr, '#2C7BB6')

# Empty proxy lines that exist only to give the legend its two entries.
plt.plot([], '-.', c='#D7191C', label='USA')
plt.plot([], c='#2C7BB6', label='RUS')
plt.legend()

plt.xticks(range(0, len(ticks) * 2, 2), ticks, rotation=70)
plt.xlim(-2, len(ticks) * 2)
# Dash-dotted whiskers for the USA boxes, matching the USA legend line.
plt.setp(bpl['whiskers'], linestyle='-.')

plt.savefig('Automated_features.png', bbox_inches="tight", dpi=150)
plt.show()



# Word cloud by country source - Figure 4


In [None]:
def _first_sentence_words(texts, dropped):
    """Return one space-joined string of lower-cased first-sentence tokens.

    Tokens found in the global ``stopwords`` list or in ``dropped`` are removed.
    """
    excluded = set(stopwords) | set(dropped)
    words = []
    for article in texts:
        # Text up to the first period. split() keeps the whole text when there
        # is no period, whereas s[0:s.find('.')] dropped the last character.
        first_sentence = article.split('.', 1)[0]
        tokens = (tok.lower() for tok in tokenizer.tokenize(first_sentence))
        words.extend(tok for tok in tokens if tok not in excluded)
    return " ".join(words)


def _show_cloud(words):
    """Draw a word cloud for ``words`` without axes and return the cloud object."""
    cloud = WordCloud().generate(words)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    return cloud


usa = train.loc[train['country-source'] == 'USA']
rus = train.loc[train['country-source'] == 'RU']

#USA
# we drop source and outlet name
text_usa = _first_sentence_words(usa['text'].tolist(), ['izvor', 'n1'])
# The results get their own names so that neither the imported `wordcloud`
# module nor the training corpus `text` is overwritten by this cell.
cloud_usa = _show_cloud(text_usa)

#RUS
text_rus = _first_sentence_words(rus['text'].tolist(), ['izvor', 'vostok', 'rt'])
cloud_rus = _show_cloud(text_rus)

# Figures 5 and 6 were created in Excel