In [4]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
import os

In [5]:
%config Completer.use_jedi = False

## Data loading

In [12]:
curr_dir = os.getcwd()
print(f"Your current directory: {curr_dir}")

path = "../data/"
files = [file for file in listdir(path) if isfile(join(path, file))]

Your current directory: /home/jloesch30/school/DeepLearningFinalProject/data engineering


In [13]:
# all files in the data folder
for file in files:
    print(file)

UkraineCombinedTweetsDeduped_MAR01.csv.gzip
UkraineCombinedTweetsDeduped20220227-131611.csv.gzip
UkraineCombinedTweetsDeduped_MAR04.csv.gzip
UkraineCombinedTweetsDeduped_MAR07.csv.gzip
UkraineCombinedTweetsDeduped_MAR05.csv.gzip
UkraineCombinedTweetsDeduped_MAR03.csv.gzip
UkraineCombinedTweetsDeduped_FEB28_part1.csv.gzip
UkraineCombinedTweetsDeduped_FEB28_part2.csv.gzip
UkraineCombinedTweetsDeduped_MAR06.csv.gzip
UkraineCombinedTweetsDeduped_FEB27.csv.gzip
UkraineCombinedTweetsDeduped_MAR02.csv.gzip


In [18]:
"""
Extract csv files
"""

import gzip
import shutil
import re

name_pattern = r"^[A-Za-z0-9\_\-]+"

path = "../data/"

for file in files:
   with gzip.open(path + file, "rt") as f_in:
    match = re.findall(name_pattern, file)[0]
    data = f_in.read()
    with open(path + "csv/" + match + ".csv", "wt+") as f_out:
        f_out.write(data)

In [25]:
import csv
import warnings
warnings.simplefilter(action='ignore')

filename27_1 = r"../data/csv/UkraineCombinedTweetsDeduped_FEB27.csv"
filename28_1 = r"../data/csv/UkraineCombinedTweetsDeduped_FEB28_part1.csv"
filename27_2 = r"../data/csv/UkraineCombinedTweetsDeduped20220227-131611.csv"
filename28_2 = r"../data/csv/UkraineCombinedTweetsDeduped_FEB28_part2.csv"
filename01  = r"../data/csv/UkraineCombinedTweetsDeduped_MAR01.csv"
filename02  = r"../data/csv/UkraineCombinedTweetsDeduped_MAR02.csv"

df27_1 = pd.read_csv(filename27_1, index_col=0,encoding='utf-8', quoting=csv.QUOTE_ALL)
df28_1 = pd.read_csv(filename28_1,  index_col=0,encoding='utf-8', quoting=csv.QUOTE_ALL)
df27_2 = pd.read_csv(filename27_2,  index_col=0,encoding='utf-8', quoting=csv.QUOTE_ALL)
df28_2 = pd.read_csv(filename28_2,  index_col=0,encoding='utf-8', quoting=csv.QUOTE_ALL)
df_0301 = pd.read_csv(filename01,  index_col=0,encoding='utf-8', quoting=csv.QUOTE_ALL)
df_0302 = pd.read_csv(filename02,  index_col=0,encoding='utf-8', quoting=csv.QUOTE_ALL)

In [26]:
df27 = pd.concat([df27_1,df27_2])
df28 = pd.concat([df28_1,df28_2])
df = pd.concat([df27,df28,df_0301,df_0302])

print(len(df27), 'tweets from 27th FEB')
print(len(df28), 'tweets from 28th FEB')
print(len(df_0301),'tweets from 1st march')
print(len(df),'tweets from the last 3 days')

1586367 tweets from 27th FEB
378171 tweets from 28th FEB
409279 tweets from 1st march
2790848 tweets from the last 3 days


## Labeling and Feature engineering

In [56]:
df_ru = df[df["language"] == "ru"]

import re
import string

def clean_text(text):
    #Make text lowercase   
    text = text.lower()
    # remove new line characters
    text = text.replace('\n', " ")
    #remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    #remove punctuation   
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text) 
    #remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    #remove links   
    text = re.sub(r'http\S+', '', text)
    #remove emojis    
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    text = regrex_pattern.sub(r'', text)
    return text

df_cleaned = pd.DataFrame(df_ru.text.apply(lambda x: clean_text(x)))

# store all values of df_cleaned in a list
lst_cleaned = df_cleaned['text'].to_list()

In [59]:
"""
Instead of iterating through and generating new calls, I believe doing it this way batches the requests and uses one access token
"""

# send list values into the translator
from googletrans import Translator
translator = Translator()
translations = translator.translate(lst_cleaned, dest='en')

In [70]:
for idx, text in enumerate(translations):
    if idx == 5:
        break
    print(f"original: {text.origin}\nTranslated: {text.text}\n{'-'*20}")

original: russian invasion on ukraine  ukraineinvasion харьков район даниловка кадры пожара после взрыва нефтебазы 
Translated: russian invasion in ukraine ukraine invasion kharkov danilovka district footage of the fire after the explosion
--------------------
original: русский сдавайся в плен  нет смысла умереть за палацы и яхты путина  тебя не убьют и будут хорошо кормить  даже разрешат мамке позвонить  ukrainewar ukrainerussia news putin swift россия москва война украина україна india france london uk usa japan brazil 
Translated: russian surrender it makes no sense to die for Putin's palaces and yachts they won't kill you and they will feed you well even let your mother call ukrainewar ukrainerussia news putin swift russia moscow war ukraine india france london uk usa japan brazil
--------------------
original: morgenshterh дай пинка durov чё он не банит в телеге каналы призывающие ставить метки для ударов и остальных прокремлевских шлюх  stoprussia ukraine helpukraine
Translated: 

In [67]:
lst_translations = []
for translation in translations:
    lst_translations.append(translation.text)

In [72]:
# replace df_clean_ru text column with new translations

df_cleaned['text'] = lst_translations

In [73]:
df_cleaned['text'].head(3)

1143139    russian invasion in ukraine ukraine invasion k...
1143146    russian surrender it makes no sense to die for...
1143405    morgenshterh give me a kick durov why doesn't ...
Name: text, dtype: object

In [74]:
# export dataframe of translated russian text
df_cleaned.to_csv("./translated_russian.csv")

In [79]:
"""
This message appears a lot of times in the dataset FYI
"""

print(df_cleaned.iloc[583])
print()
print(df_cleaned.iloc[585])

text    guys i really need help please maximum repost ...
Name: 1312641, dtype: object

text    guys i really need help please maximum repost ...
Name: 1312727, dtype: object


In [81]:
repeated_string = df_cleaned[df_cleaned['text'] == df_cleaned.iloc[583].text]
len(repeated_string)

137