In [None]:
import pandas as pd
import os
import regex as re
from datetime import datetime

import modeling as md
import acquire
import exploration
import split
import prepare

%load_ext autoreload

%autoreload 2

In [None]:
all_articles = pd.read_csv('all_articles.csv')
all_articles.head()

In [None]:
curr_time = datetime.now()

daily = pd.read_csv(f'daily{curr_time.month}_{curr_time.day}.csv')

daily.head()

In [None]:
daily.source.value_counts()

In [None]:
import date_fixer

In [None]:
fixed_df = date_fixer.make_datetime(daily).drop(columns=['index', 'Unnamed: 0'])

In [None]:
fixed_df = fixed_df.set_index('dateline')

In [None]:
fixed_df

In [None]:


rev_topics = flip_key_value_pairs(topics)
print(rev_topics)

In [None]:

topics = {'America' : 'أمريكا',
            'American' : 'أمريكيّ',
            'American (f)' : 'أمريكيّة',
            'American (pl)' : 'أمريكيّين',
            'The United States' : 'الولايات المتحدة',
            'The United States' : 'دول موحّدة',
            'Washington' : 'واشنطن',
            'Bush' : 'بوش',
            'Obama' : 'أوباما',
            'Cheney' : 'تشيني',
            'Clinton' : 'كلينتون',
            'Osama Bin Laden' : 'أسامة بن لادن',
            'Al Gore' : 'آل غور',
            'World Trade Center' : 'مركز التجارة العالمي',
            '9/11' : '9/11',
            'September 11' : '11 سبتمبر',
            'Gulf War' : 'حرب الخليج',
            'Google' : 'غوغل',
            'Facebook' : 'فيسبوك',
            'Al Qaida' : 'القاعدة'}


def flip_key_value_pairs(dicts):
    
    res = dict((v,k) for k,v in dicts.items())    
    return res


def make_eng_tags(df_tags):
    eng_tags =[]
    
    for key, value in rev_topics.items():
        if key in df_tags:
            eng_tags.append(value)
            
    return eng_tags

fixed_df.tags.apply(make_eng_tags)

In [None]:
df = pd.read_csv('/Users/kylegreen/My_Drive/arabic_NLP/daily8_4.csv')
df.head()

In [None]:
transcription_table=pd.DataFrame(
    {
        'source': ['Alittihad','Echoroukonline','Ryiadh','SaudiYoum','Techreen', 'Alqabas', 'Almustaqbal','Almasryalyoum', 'Youm7','Sabanews'],
        'year': [ 1969 , 1991 , 1965 , 1965 , 1975 ,  1972 ,  1999 , 2004 , 2008 , 1990]
    }
)
mapping = transcription_table.set_index('source').to_dict()['year']
df['source_start_year'] = df['source'].apply(lambda x: mapping.get(x))

df.head()

In [None]:
df.shape

In [None]:
df =  df[df.text_label != 'False']
df.head()

In [None]:
def country_tagger(df):
    country_map = { 'Alittihad': 'emirates',
                    'Echoroukonline': 'algeria',
                    'Ryiadh': 'ksa',
                    'SaudiYoum': 'ksa',
                    'Techreen': 'syria',
                    'Alqabas': 'kuwait',
                    'Almustaqbal': 'lebanon',
                    'Almasryalyoum': 'egypt',
                    'Youm7': 'egypt',
                    'Sabanews': 'yemen',
                    }
    df['country'] = df.source.map(country_map)
    return df

df = country_tagger(df)
df.head()

In [None]:
list_of_tags = []
for tag in df.tags.values:
    list_of_tags.extend([val[1:-1] for val in tag[1:-1].split(', ')])
    
tag_list = list(set(list_of_tags))

for tag in tag_list:
    df[tag] = 0

for i, tag in enumerate(df.tags):
    for t in tag_list:
        if t in tag:
            df[t].iloc[i] = 1
            
df.head()

In [None]:
df.tags

In [None]:
df[tag_list]

In [None]:
def encode_values(df, columns_to_encode):
    '''
    This function takes in a prepared dataframe and using one-hot encoding, encodes categorical variables. It does not drop the original
    categorical columns. This is done purposefully to allow for easier Exploratory Data Analysis.  Removal of original categorical columns
    will be done in a separate function later if desired.
    Parameters: df - a prepared dataframe with the expected feature names and columns
    Returns: encoded - a dataframe with all desired categorical columns encoded.
    '''
    dummies_list = columns_to_encode

    dummy_df = pd.get_dummies(df[dummies_list], drop_first=False)
    encoded = pd.concat([df, dummy_df], axis = 1)
    return encoded

encoded = encode_values(df, ['source', 'country'])
encoded.head()

In [None]:
features = df[['dateline', 'source', 'country', 'tags']]

In [None]:
df['scaled_date'] = (df.dateline.astype('datetime64') - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')


In [None]:
df.scaled_date

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaler.fit(df[['scaled_date', 'source_start_year']])

df[['scaled_date', 'source_start_year_scaled']] = scaler.transform(df[['scaled_date', 'source_start_year']])

In [None]:
df

In [None]:
df.columns

In [None]:
tags_to_drop = ['index', 'id', 'url', 'headline', 'dateline', 'text', 'tags', 'source', 'text_score', 'headline_label', 'headline_score',
       'source_start_year', 'country']

In [None]:
#encoded = encoded.drop(columns=tags_to_drop)
encoded = encoded.rename(columns={'scaled_date': 'scaled_pub_date'})

In [None]:
encoded.columns

In [None]:
encoded = encoded.drop(columns=tags_to_drop)

In [None]:
encoded.to_csv('encoded_df.csv', index=False)

In [None]:
train, validate, test = split.train_validate_test_split(encoded, 'text_label')

models = md.all_reports(train, validate, test, 'text_label')

In [None]:
md.Results.total_summary.sort_values(by='validate_accuracy', ascending=False).head()

In [None]:
# original dataframe
df.text_label.value_counts(normalize=True)

In [None]:
num_articles = df.shape[0]
baseline_correct_articles = 0.732755 * num_articles
model_correct_articles = 0.743429 * num_articles0

In [None]:
# num of articles correctly identified by our model compared to baseline
int(model_correct_articles - baseline_correct_articles)

Our model performs 1.46% better than baseline.

In [None]:
md.Results.total_summary.to_csv('model_results.csv', index=False)

In [None]:
results = pd.read_csv('model_results.csv')

In [None]:
results.sort_values(by='validate_accuracy', ascending=False).head(10)

In [None]:
md.random_forests(train, validate, test, 'text_label', min_sample_leaf=4, depth=14)

In [None]:
df = pd.read_csv('daily8_5.csv')
df.head(), df.shape

In [None]:
final_df = df[df.text_label != 'False']

In [None]:
final_df.shape

In [None]:
final_df.to_csv('final_df.csv', index=False)

In [None]:
final_df

In [None]:
final_df.iloc[331637]

In [None]:
df.iloc[331637, df.columns.get_loc('dateline')] = '2008-04-02 00:00:00'

In [None]:
df.shape

In [None]:
final_df = df[df.text_label !='False'].drop(columns='index')

In [None]:
final_df

In [None]:
final_df.to_csv('final_df.csv', index=False)

In [None]:
final_df

In [1]:
import datetime as dt
from sklearn.preprocessing import MinMaxScaler


def encode_data():

    df = pd.read_csv('final_df.csv')
    df.dateline = df.dateline.astype('datetime64')

    def is_gov_controlled(entry):
        if entry in ['Alqabas', 'Echoroukonline', 'Ryiadh', 'Saudiyoum', 'Almustaqbal', 'Youm7', 'Almasryalyoum']:
            return 1
        else:
            return 0

    df['ownership_status'] = df.source.apply(is_gov_controlled)


    def encode_values(df, columns_to_encode):
        '''
        This function takes in a prepared dataframe and using one-hot encoding, encodes categorical variables. It does not drop the original
        categorical columns. This is done purposefully to allow for easier Exploratory Data Analysis.  Removal of original categorical columns
        will be done in a separate function later if desired.
        Parameters: df - a prepared dataframe with the expected feature names and columns
        Returns: encoded - a dataframe with all desired categorical columns encoded.
        '''
        dummies_list = columns_to_encode

        dummy_df = pd.get_dummies(df[dummies_list], drop_first=False)
        encoded = pd.concat([df, dummy_df], axis = 1)
        return encoded

    def country_tagger(df):
        country_map = { 'Alittihad': 'emirates',
                        'Echoroukonline': 'algeria',
                        'Ryiadh': 'ksa',
                        'SaudiYoum': 'ksa',
                        'Techreen': 'syria',
                        'Alqabas': 'kuwait',
                        'Almustaqbal': 'lebanon',
                        'Almasryalyoum': 'egypt',
                        'Youm7': 'egypt',
                        'Sabanews': 'yemen',
                        }
        df['country'] = df.source.map(country_map)
        return df

    def within_30_days(df_dateline, date):

        if (df_dateline - date).days < 30 and (df_dateline - date).days > -30:
            return 1
        else:
            return 0

    important_dates = {
        'september_11th': pd.to_datetime('09-11-2001'),
        'capture_of_baghdad': pd.to_datetime('04-09-2003'),
        'nick_berg': pd.to_datetime('05-12-2004'),
        'iran_nulcear': pd.to_datetime('08-30-2006'),
        'arab_spring': pd.to_datetime('12-20-2011')
    }

    for event, date in important_dates.items():
        df[event] = df.dateline.apply(within_30_days, args = (date,))

    df = country_tagger(df)


    def encode_tags(df):

        list_of_tags = []
        for tag in df.tags.values:
            list_of_tags.extend([val[1:-1] for val in tag[1:-1].split(', ')])

        tag_list = list(set(list_of_tags))

        for tag in tag_list:
            df[tag] = 0

        for i, tag in enumerate(df.tags):
            for t in tag_list:
                if t in tag:
                    df[t].iloc[i] = 1

        return df


    df = encode_tags(df)

    df.head()
    #print('encoding...')
    encoded = encode_values(df, ['source', 'country'])
    encoded.head()

    #print('scaling...')
    df['scaled_date'] = (df.dateline.astype('datetime64') - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    scaler = MinMaxScaler()

    scaler.fit(df[['scaled_date']])

    df['scaled_date'] = scaler.transform(df[['scaled_date']])
    #print('encoding...')

    #print('dropping non-encoded columns')

    tags_to_drop = ['id', 'url', 'headline', 'dateline', 'text', 'tags', 'source', 'text_score', 'headline_label', 'headline_score', 'country']

    encoded = encoded.rename(columns={'scaled_date': 'scaled_pub_date'})

    encoded = encoded.drop(columns=tags_to_drop)
    #print('splitting...')
    encoded.text_label = encoded.text_label.map({'neutral': 0, 'negative': -1, 'positive': 1})

    return encoded

def model_train_val_test(encoded_df):
    train, validate, test = split.train_validate_test_split(encoded_df, 'text_label')
    clf = RandomForestClassifier(max_depth=14, min_samples_leaf=1)

    x_train = train.drop(columns='text_label')
    y_train = train['text_label']
    
    x_validate = validate.drop(columns='text_label')
    y_validate = validate['text_label']

    x_test = test.drop(columns='text_label')
    y_test = test['text_label']

    clf.fit(x_train, y_train)

    x_predic = clf.predict(x_test)
    train_score = clf.score(x_train, x_train)
    validate_score = clf.score(x_validate, y_validate)
    test_score = clf.score(x_test, y_test)
    
    results = pd.DataFrame({
        'model': 'random_forests',
        'depth': '14',
        'min_sample_leaf': '1',
        'train_acc': round(train_score * 100,1),
        'validate_acc': round(validate_score * 100,1),
        'test_acc': round(test_score * 100, 1)
        })
    
    return results, clf

In [3]:
import pandas as pd
import os
import regex as re
from datetime import datetime

import modeling as md
import acquire
import exploration
import split
import prepare

%load_ext autoreload

%autoreload 2


encoded = encode_data()

In [4]:
encoded

Unnamed: 0,text_label,ownership_status,september_11th,capture_of_baghdad,nick_berg,iran_nulcear,arab_spring,تشيني,أمريكا,القاعدة,...,source_Techreen,source_Youm7,country_algeria,country_egypt,country_emirates,country_ksa,country_kuwait,country_lebanon,country_syria,country_yemen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237701,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
237702,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
237703,-1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
237704,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
features = ['source_Techreen', 'country_syria', 'source_Almasryalyoum', 'country_emirates', 'source_Alittihad', 'source_Youm7', 'source_SaudiYoum', 'country_yemen', 'source_Sabanews', 'country_ksa', 'capture_of_baghdad', 'ownership_status', 'source_Alqabas', 'country_kuwait', 'nick_berg', 'text_label']

words = 'بوش, أمريكا, غوغل, تشيني, حرب الخليج'
words = words.split(', ')
features.extend(words)

to_remove = ['country_syria', 'source_Alittihad', 'source_Sabanews']
[features.remove(x) for x in to_remove]

[None, None, None]

In [None]:
results.to_csv('results.csv', index=False)

In [None]:
results

In [6]:
from sklearn.ensemble import RandomForestClassifier

def model_train_val_test(encoded_df):
    train, validate, test = split.train_validate_test_split(encoded_df, 'text_label')
    clf = RandomForestClassifier(max_depth=14, min_samples_leaf=1, random_state=0)

    x_train = train.drop(columns='text_label')
    y_train = train['text_label']
    
    x_validate = validate.drop(columns='text_label')
    y_validate = validate['text_label']

    x_test = test.drop(columns='text_label')
    y_test = test['text_label']

    clf.fit(x_train, y_train)

    train_score = clf.score(x_train, y_train)
    validate_score = clf.score(x_validate, y_validate)
    test_score = clf.score(x_test, y_test)
    
    results = pd.DataFrame({
        'model': 'random_forests',
        'depth': '14',
        'min_sample_leaf': '1',
        'train_acc': round(train_score * 100,1),
        'validate_acc': round(validate_score * 100,1),
        'test_acc': round(test_score * 100, 1)
        }, index=range(1))
    
    return results, clf


In [11]:
def encode_and_model():
    encoded = encode_data()

    features = ['source_Techreen', 'country_syria', 'source_Almasryalyoum', 'country_emirates', 'source_Alittihad', 'source_Youm7', 'source_SaudiYoum', 'country_yemen', 'source_Sabanews', 'country_ksa', 'capture_of_baghdad', 'ownership_status', 'source_Alqabas', 'country_kuwait', 'nick_berg', 'text_label']

    words = 'بوش, أمريكا, غوغل, تشيني, حرب الخليج'
    words = words.split(', ')
    features.extend(words)

    to_remove = ['country_syria', 'source_Alittihad', 'source_Sabanews']
    [features.remove(x) for x in to_remove]


    results, clf = model_train_val_test(encoded[features])
    return results

Unnamed: 0,model,depth,min_sample_leaf,train_acc,validate_acc,test_acc
0,random_forests,14,1,73.9,73.9,73.9


In [10]:
dict(zip(clf.feature_names_in_, clf.feature_importances_))

{'source_Techreen': 0.3120104383896887,
 'source_Almasryalyoum': 0.07678653480385358,
 'country_emirates': 0.036817746909751294,
 'source_Youm7': 0.07978411103008143,
 'source_SaudiYoum': 0.028452966839296375,
 'country_yemen': 0.027549055815506134,
 'country_ksa': 0.04834361354080233,
 'capture_of_baghdad': 0.017122953378165957,
 'ownership_status': 0.05540847964825931,
 'source_Alqabas': 0.005576187362146184,
 'country_kuwait': 0.005112449417139919,
 'nick_berg': 0.008195269145295128,
 'بوش': 0.05363689134180812,
 'أمريكا': 0.2160629307262095,
 'غوغل': 0.005475695031461952,
 'تشيني': 0.010847001861394855,
 'حرب الخليج': 0.012817674759139277}

In [None]:
clf.get_params()

In [None]:
clf.feature_names_

In [None]:
train, validate, test = split.train_validate_test_split(encoded[features], 'text_label')

x_train = train.drop(columns='text_label')
y_train = train['text_label']

clf.decision_path(x_train)

In [None]:
clf.feature_importances_

In [None]:
features.remove('text_label')
dict(zip(list(clf.feature_importances_), features))



In [None]:
clf.n_features_in_

In [None]:
features

In [None]:
encoded[features]

In [None]:
features.remove('text_label')

In [None]:
scores = {}

for weight in clf.feature_importances_:
    for feat in features:
        scores[feat] = weight
        break
    
scores

In [None]:
encoded.text_label = encoded.text_label.map({'neutral': 0, 'negative': -1, 'positive': 1})



In [None]:
encoded.text_label

In [None]:
train.text_label.value_counts(normalize=True)

In [None]:
encoded.columns

In [None]:
md.decision_tree(train, validate, test, 'text_label')

In [None]:
# 30 day windows on both ends does not improve model performance

In [None]:
encoded

In [None]:
encoded[features].columns

In [None]:
encoded.columns

In [None]:
features = ['source_Techreen', 'country_syria', 'source_Almasryalyoum', 'country_emirates', 'source_Alittihad', 'source_Youm7', 'source_SaudiYoum', 'country_yemen', 'source_Sabanews', 'country_ksa', 'capture_of_baghdad', 'ownership_status', 'source_Alqabas', 'country_kuwait', 'nick_berg', 'text_label']

In [None]:
features = ['source_Techreen', 'country_syria', 'source_Almasryalyoum', 'country_emirates', 'source_Alittihad', 'source_Youm7', 'source_SaudiYoum', 'country_yemen', 'source_Sabanews', 'country_ksa', 'capture_of_baghdad', 'ownership_status', 'source_Alqabas', 'country_kuwait', 'nick_berg', 'text_label']

words = 'بوش, أمريكا, غوغل, تشيني, حرب الخليج'
words = words.split(', ')
features.extend(words)

In [None]:
features.extend(words)

In [None]:
encoded[features]

In [None]:
features.remove('America')

In [None]:
features

In [None]:
from sklearn.svm import SVC

x_train = train.drop(columns='text_label')
y_train = train['text_label']

x_validate = validate.drop(columns='text_label')
y_validate = validate['text_label']

clf = SVC(kernel='poly', degree=2)
clf.fit(x_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.5f}'
      .format(clf.score(x_train, y_train)))
print('Accuracy of Decision Tree classifier on val set: {:.5f}'
      .format(clf.score(x_validate, y_validate)))

In [None]:
,Var,Scores
35,source_Techreen,8943.448879256535
42,country_Syria,8943.448879256423
28,source_Almasryalyoum,2355.0164748339
10,America,1749.3620117708529
43,country_UAE,1254.1639088750942
27,source_Alittihad,1254.1639088750908
36,source_Youm7,1254.1038226182388
23,scaled_date,1028.351322645141
21,Bush,825.5788927612839
34,source_SaudiYoum,563.4215943320206
44,country_Yemen,532.3199952120876
33,source_Sabanews,532.3199952120876
41,country_Saudi_Arabia,359.4754705031025
1,capt_bag,272.2506389745735
7,Google,239.54317229043158
26,ownership_status,223.20219347351065
30,source_Alqabas,176.74692556317044
39,country_Kuwait,176.74692556317044
6,Gulf War,130.54652729478855
9,Cheney,124.84083618600735
2,nick,123.55177378850968

features = ['source_Techreen']

In [None]:
import os
import pandas as pd
import time
from datetime import datetime
import date_fixer
import regex as re
import pandas as pd
from googletrans import Translator
from camel_tools.sentiment import SentimentAnalyzer
from transformers import pipeline


topics = {'America' : 'أمريكا',
            'American' : 'أمريكيّ',
            'American (f)' : 'أمريكيّة',
            'American (pl)' : 'أمريكيّين',
            'The United States' : 'الولايات المتحدة',
            'The United States' : 'دول موحّدة',
            'Washington' : 'واشنطن',
            'Bush' : 'بوش',
            'Obama' : 'أوباما',
            'Cheney' : 'تشيني',
            'Clinton' : 'كلينتون',
            'Osama Bin Laden' : 'أسامة بن لادن',
            'Al Gore' : 'آل غور',
            'World Trade Center' : 'مركز التجارة العالمي',
            '9/11' : '9/11',
            'September 11' : '11 سبتمبر',
            'Gulf War' : 'حرب الخليج',
            'Google' : 'غوغل',
            'Facebook' : 'فيسبوك',
            'Al Qaida' : 'القاعدة'}


def flip_key_value_pairs(dicts):
    res = dict((v,k) for k,v in dicts.items())
    return res

def make_eng_tags(df_tags):
    eng_tags =[]
    rev_topics = flip_key_value_pairs(topics)
    for key, value in rev_topics.items():
        if key in df_tags:
            eng_tags.append(value)
    return eng_tags


""" 
Change name to which file you're doing sentiment analysis on, NOT including the '.csv' 
Output file will be 'labeled_<filename>.csv'
"""

msa = pipeline('text-classification', model="CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")



def load_and_label_df(name):
    path = 'split_articles/'
    df = load_csv(path + name)
    print(f'loaded {name}')
    print('labeling/scoring...')
    df = create_labels_scores(df, name)
    print('done labeling/scoring!')
    return df

def load_csv(filename):
    df = pd.read_csv(filename)
    return df

def make_msa(df_text):
    
    try:
        done = msa(df_text)
        return done
    except:
        
        try:
            first_half = msa(df_text[:round(len(df_text)/2)]) 
            second_half = msa(df_text[round(len(df_text)/2):])
            if first_half[0]['label'] == second_half[0]['label']:
                label = first_half[0]['label']
                score = (test_1[0]['score'] + test_2[0]['score'])/2
            done = [{'label': label, 'score': score}]
            return done
        except:
        
            try:
                beginning = msa(df_text[:round(len(df_text)/3)]) 
                middle = msa(df_text[round(len(df_text)/3):round(len(df_text)*2/3)])
                end = msa(df_text[round(len(df_text)*2/3):])

                if (beginning[0]['label'] == middle[0]['label']) and (beginning[0]['label'] == end[0]['label']):
                    label = first_half[0]['label']
                    score = (beginning[0]['score'] + middle[0]['score'] + end[0]['score'])/3
                    done = [{'label': label, 'score': score}]
                    return done
                else:
                    return 'sentiment of parts not equal'
            except:
                return '3 is not enough'
        
def analyze_text(df):
    scores = []
    print('analyzing_texts')
    scores = [make_msa(val) for val in df.text.values]
    return scores

def analyze_headline(df):
    print('analyzing headlines')
    headline_scores = [make_msa(val) for val in df.headline.values]
    return headline_scores

def label_and_scores(msa_scores):
    labels = []
    scores = []
    for val in msa_scores:
        try:
            labels.append(val[0]['label'])
            scores.append(val[0]['score'])
        except:
            labels.append(False)
            scores.append(False)

    return labels, scores

def create_labels_scores(df, name):
    text_scores = analyze_text(df)
    labels, scores = label_and_scores(text_scores)
    df['text_label'] = labels
    df['text_score'] = scores

    headline_scores = analyze_headline(df)
    labels, scores = label_and_scores(headline_scores)
    df['headline_label'] = labels
    df['headline_score'] = scores

    # CHANGE 'BLOCK_NAME' TO WHATEVER YOU WANT
    df.to_csv('labeled_split_articles/labeled_'+ name + '.csv', index=False)

    return df

def split_tons_of_csvs():
    df = pd.read_csv('C:/Users/kyleg/codeup/all_articles.csv')
    
    end_point = 'C:/Users/kyleg/codeup/split_articles/'
    
    df_shape = df.shape[0]
    
    one_thousandth = int(df_shape/1000)
    
    for i in range(0, df_shape, one_thousandth):
        copy = df.iloc[i:i+one_thousandth].copy()
        copy.to_csv(end_point+str(i) + '.csv', index=False)
        

def make_random_indexes(list_of_files):
    unique_ = []
    length = len(list_of_files)
    unique_indexes= np.random.randint(0, length, size=length**2 , dtype=int)
    [unique_.append(val) for val in unique_indexes if val not in unique_]

    return unique_


def split_and_process_articles():
    labeled = 'labeled_split_articles'
    unlabeled = 'split_articles'

    os.listdir(labeled)

    unlabeled_files = os.listdir(unlabeled)
    labeled_files = []

    for path in os.listdir(labeled):
        labeled_files.append(re.search('\d*\.', path).group()[:-1])

    print(labeled_files)


    ul_files = unlabeled_files[:]
    files_found  = []
    for p in labeled_files:
        for pth in unlabeled_files:
            if p+'.csv' == pth:
                ul_files.remove(pth)

    path = 'split_articles/'
    unique_indexes  = make_random_indexes(ul_files)
    print(unique_indexes)
    for i in unique_indexes:
        load_and_label_df(ul_files[i])

def within_30_days(df_dateline, date):
    
    if (df_dateline - date).days < 30 and (df_dateline - date).days > -30:
        return 1
    else:
        return 0

def create_date_features(df):
    important_dates = {
        'september_11th': pd.to_datetime('09-11-2001'),
        'capture_of_baghdad': pd.to_datetime('04-09-2003'),
        'nick_berg': pd.to_datetime('05-12-2004'),
        'iran_nulcear': pd.to_datetime('08-30-2006'),
        'arab_spring': pd.to_datetime('12-20-2011')
    }

    for event, date in important_dates.items():
        df[event] = df.dateline.apply(within_30_days)
        
    return df


def update_csv():
    waiting = False

    while True:
        time.sleep(5)
        curr_time = pd.Timestamp.now()

        if curr_time.hour % 2 != 0:
            waiting = False

        if curr_time.hour % 2 == 0 and not waiting:
            print('updating csv...')
            all_dfs = []
            path = 'C:/Users/kyleg/codeup/labeled_split_articles/'
            parent = 'E:/drive/arabic_NLP/'
            for fi in os.listdir(path):
                all_dfs.append(pd.read_csv(path + fi))

            daily_csv = pd.concat(all_dfs).reset_index()
            print('fixing dates...')
            daily_csv = date_fixer.make_datetime(daily_csv)
            print('writing csv...')
            daily_csv.to_csv(parent + f'daily{curr_time.month}_{curr_time.day}.csv', index=False)
            print(f'csv written. updating in google drive as of {curr_time.hour}:{curr_time.min}')
            waiting = True
    
def techreen_date(date_entry):
    '''
    This function takes in a date entry and applies a regex search to isolate just the relevant date info
    and returns it out.
    '''
    exp = r'(\d?\d)/(\d?\d)/(\d\d\d\d)'
    match = re.search(exp, str(date_entry))
    day =  match[1]
    month = match[2]
    year = match[3]
    date = year+ '-' + month + '-' + day
    return date

def saudiyoum_date(date_entry):
    '''
    This function takes in a date entry and applies a regex search to isolate just the relevant date info
    and returns it out.
    '''
    try:
        exp = r'(2\d\d\d)-(\d?\d)-(\d?\d)'
        match = re.search(exp, str(date_entry))
        day = match[3]
        month = match[2]
        year = match[1]
        date = year+ '-' + month + '-' + day
        return date
    except:
        exp = r'(\d\d\d\d)/(\d?\d)/(\d?\d)'
        match = re.search(exp, str(date_entry))
        day =  match[3]
        month = match[2]
        year = match[1]
        date = year+ '-' + month + '-' + day
        return date

def youm7_date(date_entry, months_map):
    exp = r'(\d?\d)(.*)(\d\d\d\d)'
    match = re.search(exp, date_entry)
    day =  match[1]
    month = match[2].strip()
    year = match[3]
    month = months_map[month]
    date = year+ '-' + month + '-' + day
    return date

def alittihad_date(date_entry, months_map):
    exp = r'(\d?\d)(.*)(\d\d\d\d)'
    match = re.search(exp, date_entry)
    day =  match[1]
    month = match[2].strip()
    year = match[3]
    month = months_map[month]
    date = year+ '-' + month + '-' + day
    return date

def almustaqbal_date(date_entry):
    arabic_months_map = {   'كانون الثاني':'01',
                            'شباط':'02',
                            'آذار':'03',
                            'نيسان':'04',
                            'أيار':'05',
                            'حزيران':'06',
                            'تموز':'07',
                            'آب':'08',
                            'أيلول':'09',
                            'تشرين الأول':'10',
                            'تشرين الثاني':'11',
                            'كانون الأول':'12'}
    exp = r'(\d?\d)\s(.*)\s(\d\d\d\d).+العدد'
    match = re.search(exp, date_entry)
    day =  match[1]
    month = match[2].strip()
    year = match[3]
    month = arabic_months_map[month]
    date = year+ '-' + month + '-' + day
    return date

def ryiadh_date(date_entry, months_map):
    try:
        exp = r'-\s?(\d?\d)\s?(.+)\s?(\d\d\d\d)\s?م'
        match = re.search(exp, date_entry)
        day =  match[1]
        month = match[2].strip()
        year = match[3]
        month = months_map[month]
        date = year+ '-' + month + '-' + day
        return date

    except:
        try:
            exp = r'(\d?\d)/(\d?\d)/(\d\d\d\d)'
            match = re.search(exp, date_entry)
            day =  match[1]
            month = match[2].strip()
            year = match[3]
            date = year+ '-' + month + '-' + day
            return date
        except:
            try:
                exp = r'(\d\d\d\d)-(\d?\d)-(\d?\d)'
                match = re.search(exp, date_entry)
                day =  match[3]
                month = match[2].strip()
                year = match[1]
                date = year+ '-' + month + '-' + day
                return date
            except:
                return pd.NaT

def alqabas_date(date_entry):
    exp = r'(\d\d\d\d)/(\d?\d)/(\d?\d)'
    match = re.search(exp, date_entry)
    day =  match[3]
    month = match[2]
    year = match[1]
    date = year+ '-' + month + '-' + day
    return date

def almasryalyoum_date(date_entry):
    exp = r'(\d?\d)/(\d?\d)/(\d\d\d\d)'
    match = re.search(exp, date_entry)
    day =  match[1]
    month = match[2]
    year = match[3]
    date = year+ '-' + month + '-' + day
    return date

def sabanews_date(date_entry, months_map):
    exp = r'(\d?\d)/(.+)/(\d\d\d\d)'
    match = re.search(exp, date_entry)
    day =  match[1]
    month = match[2].strip()
    month = months_map[month]
    year = match[3]
    date = year+ '-' + month + '-' + day
    return date

def echoroukonline_date(date_entry):
    exp = r'(\d\d\d\d)/(\d?\d)/(\d?\d)'
    match = re.search(exp, date_entry)
    day =  match[3]
    month = match[2]
    year = match[1]
    date = year+ '-' + month + '-' + day
    return date

def make_datetime(dataframe):
    # This function works, ignore the one above
    df = dataframe.dropna()
    months_map = {'يناير':'01',
        'ينابر':'01',
        'فبراير':'02',
        'مارس':'03',
        'أبريل':'04',
        'ابريل':'04',
        'مايو':'05',
        'يونيو':'06',
        'يوليو':'07',
        'أغسطس':'08',
        'اغسطس':'08',
        'سبتمبر':'09',
        'أكتوبر':'10',
        'اكتوبر':'10',
        'نوفمبر':'11',
        'ديسمبر':'12',
        'إبريل':'04',
        'ماي':'05',
        'يونيه':'06',
        'يوليه':'07'}
    m_maps = months_map
    date_list = []

    for i in range(len(df)):

        if df.iloc[i].source == 'SaudiYoum':
            date_list.append(saudiyoum_date(df.iloc[i]['dateline']))
            #print(df.id.iloc[i] + ' successful')
            
        elif df.iloc[i].source == 'Techreen':
            date_list.append(techreen_date(df.iloc[i]['dateline']))
            #print(df.id.iloc[i] + ' successful')

        elif df.iloc[i].source == 'Youm7':
            date_list.append(youm7_date(df.iloc[i]['dateline'], m_maps))
            #print(df.id.iloc[i] + ' successful')

        elif df.iloc[i].source == 'Alittihad':
            date_list.append(alittihad_date(df.iloc[i]['dateline'], m_maps))
            #print(df.id.iloc[i] + ' successful')

        elif df.iloc[i].source == 'Almustaqbal':
            date_list.append(almustaqbal_date(df.iloc[i]['dateline']))
            #print(df.id.iloc[i] + ' successful')

        elif df.iloc[i].source == 'Ryiadh':
            date_list.append(ryiadh_date(df.iloc[i]['dateline'], m_maps))
            #print(df.id.iloc[i] + ' successful')

        elif df.iloc[i].source == 'Alqabas':
            date_list.append(alqabas_date(df.iloc[i]['dateline']))
            #print(df.id.iloc[i] + ' successful')

        elif df.iloc[i].source == 'Almasryalyoum':
            date_list.append(almasryalyoum_date(df.iloc[i]['dateline']))
            #print(df.id.iloc[i] + ' successful')

        elif df.iloc[i].source == 'Sabanews':
            date_list.append(sabanews_date(df.iloc[i]['dateline'], m_maps))
            #print(df.id.iloc[i] + ' successful')
            
        elif df.iloc[i].source == 'Echoroukonline':
            date_list.append(echoroukonline_date(df.iloc[i]['dateline']))
            #print(df.id.iloc[i] + ' successful')
    df['dateline'] = pd.to_datetime(date_list)
    df = df.dropna()
    return df

def using_ahocorasick(col, lst):
    A = ahocorasick.Automaton(ahocorasick.STORE_INTS)
    for word in lst:
        A.add_word(word.lower())
    A.make_automaton() 
    col = col.astype(str)
    col = col.str.lower()
    mask = col.apply(lambda x: bool(list(A.iter(x))))
    tags = col.apply(lambda x: list(A.iter(x)))
    return mask, tags

def look_for_words_in_text(df_text):
    
    topics = {'America' : 'أمريكا',
            'American' : 'أمريكيّ',
            'American (f)' : 'أمريكيّة',
            'American (pl)' : 'أمريكيّين',
            'The United States' : 'الولايات المتحدة',
            'The United States' : 'دول موحّدة',
            'Washington' : 'واشنطن',
            'Bush' : 'بوش',
            'Obama' : 'أوباما',
            'Cheney' : 'تشيني',
            'Clinton' : 'كلينتون',
            'Osama Bin Laden' : 'أسامة بن لادن',
            'Al Gore' : 'آل غور',
            'World Trade Center' : 'مركز التجارة العالمي',
            '9/11' : '9/11',
            'September 11' : '11 سبتمبر',
            'Gulf War' : 'حرب الخليج',
            'Google' : 'غوغل',
            'Facebook' : 'فيسبوك',
            'Al Qaida' : 'القاعدة'}
    
    topics = flip_key_value_pairs(topics)
    
    tags = []
    for key in topics.keys():
        if key in df_text:
            tags.append(key)
            
    return tags

def make_relevant_tagged_df(df):
    topics = {'America' : 'أمريكا',
            'American' : 'أمريكيّ',
            'American (f)' : 'أمريكيّة',
            'American (pl)' : 'أمريكيّين',
            'The United States' : 'الولايات المتحدة',
            'The United States' : 'دول موحّدة',
            'Washington' : 'واشنطن',
            'Bush' : 'بوش',
            'Obama' : 'أوباما',
            'Cheney' : 'تشيني',
            'Clinton' : 'كلينتون',
            'Osama Bin Laden' : 'أسامة بن لادن',
            'Al Gore' : 'آل غور',
            'World Trade Center' : 'مركز التجارة العالمي',
            '9/11' : '9/11',
            'September 11' : '11 سبتمبر',
            'Gulf War' : 'حرب الخليج',
            'Google' : 'غوغل',
            'Facebook' : 'فيسبوك',
            'Al Qaida' : 'القاعدة'}


    topics = flip_key_value_pairs(topics)
    mask, tags = using_ahocorasick(df.text, list(topics.keys()))

    copied = df[mask].copy()

    return copied, tags

def ramadan(df):
    date_list = []
    for i in range(len(df)):
        if (
        (df.dateline[i] >=dt.datetime(2001, 11, 17)) & (df.dateline[i] <= dt.datetime(2001, 12, 16))
        or (df.dateline[i] >=dt.datetime(2002, 11, 6)) & (df.dateline[i] <= dt.datetime(2002, 12, 5))
        or (df.dateline[i] >=dt.datetime(2003, 10, 27)) & (df.dateline[i] <= dt.datetime(2003, 11, 25))
        or (df.dateline[i] >=dt.datetime(2004, 10, 16)) & (df.dateline[i] <= dt.datetime(2004, 11, 13))
        or (df.dateline[i] >=dt.datetime(2005, 10, 5)) & (df.dateline[i] <= dt.datetime(2005, 11, 2))
        or (df.dateline[i] >=dt.datetime(2006, 9, 24)) & (df.dateline[i] <= dt.datetime(2006, 10, 23))
        or (df.dateline[i] >=dt.datetime(2007, 9, 13)) & (df.dateline[i] <= dt.datetime(2007, 10, 12))
        or (df.dateline[i] >=dt.datetime(2008, 9, 2)) & (df.dateline[i] <= dt.datetime(2008, 10, 1))
        or (df.dateline[i] >=dt.datetime(2009, 8, 22)) & (df.dateline[i] <= dt.datetime(2009, 9, 20))
        or (df.dateline[i] >=dt.datetime(2010, 8, 11)) & (df.dateline[i] <= dt.datetime(2010, 9, 9))
        or (df.dateline[i] >=dt.datetime(2011, 8, 1)) & (df.dateline[i] <= dt.datetime(2011, 8, 30))
        or (df.dateline[i] >=dt.datetime(2012, 7, 20)) & (df.dateline[i] <= dt.datetime(2012, 8, 18))
        or (df.dateline[i] >=dt.datetime(2013, 7, 9)) & (df.dateline[i] <= dt.datetime(2013, 8, 7))
        or (df.dateline[i] >=dt.datetime(2014, 6, 29)) & (df.dateline[i] <= dt.datetime(2014, 7, 28))):
            date_list.append(1)
        else:
            date_list.append(0)
        
    return date_list

def encode_tags(df):

    list_of_tags = []
    for tag in df.tags.values:
        list_of_tags.extend([val[1:-1] for val in tag[1:-1].split(', ')])

    tag_list = list(set(list_of_tags))

    for tag in tag_list:
        df[tag] = 0

    for i, tag in enumerate(df.tags):
        for t in tag_list:
            if t in tag:
                df[t].iloc[i] = 1
                
    return df


def create_encoded_df(df):
    '''Creates an encoded df from the original df, including bigrams and trigrams, that is 
    compatible with modeling.'''
    tfidf = TfidfVectorizer(ngram_range=(1,3))
    tfidfs = tfidf.fit_transform(df.dropna().lemm.values)

    tfidf_df = pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names())
    col = pd.DataFrame({'programming_language_99': df.dropna().reset_index().drop(columns='index').language.values})

    encoded_df = pd.concat([tfidf_df, col], axis=1)

    return encoded_df