In [1]:
import pandas as pd
import numpy as np
import os
from deep_translator import GoogleTranslator

In [2]:
def get_csv_files(folder_path):
    return [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".csv")]

In [3]:
# translating all the tweets to a english language   
def translate_text_to_english(text):
    try:
        translated_text = GoogleTranslator(source='auto', target='en').translate(text)
        print("Translated text: ", translated_text)
        
        return translated_text
    
    except Exception as e:
        print(f"Translation error: {e}")
        return text

In [4]:
# removing the user tags through regex
import re
def remove_tags(text):
    return re.sub(r"@[a-zA-Z0-9_]+|#[a-zA-Z0-9_]+", "", text)
    

In [5]:
# in this function we have combined the data from different csv files and removed the duplicate tweets as well
def data_collection(csv_file_lists):
    dataframes = []
    for file in csv_file_lists:
        try:
            df = pd.read_csv(file)
            dataframes.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    if not dataframes:
        
        print("No valid CSV files found.")
        return

    merged_df = pd.concat(dataframes, ignore_index = True)
    if 'tweet' in merged_df.columns:
        merged_df = merged_df.drop_duplicates(subset=['tweet'], keep='first')

    merged_dict = {col: merged_df[col].tolist() for col in merged_df.columns}

    final_df = pd.DataFrame(merged_dict)
    return final_df


   

In [6]:
folder_path = "../Datasets"
data_files = get_csv_files(folder_path)
merged_data = data_collection(data_files)
merged_data.head()

Unnamed: 0.1,Unnamed: 0,date,time,id,tweet,language,replies_count,retweets_count,likes_count,conversation_id,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,0.0,2022-08-20,10:08:03,1560810865359835136,@Idolmaker66 @6abc They can't prove anything a...,en,0,0,0,,...,,,,,,,,,,
1,1.0,2022-08-20,10:07:59,1560810847911530497,Health official says risk remains low as Manit...,en,0,0,0,,...,,,,,,,,,,
2,2.0,2022-08-20,10:07:52,1560810818589130752,Penn State University Student Tests Positive F...,en,0,0,0,,...,,,,,,,,,,
3,3.0,2022-08-20,10:07:36,1560810752985903105,Viruela del mono: Estos son los tratamientos d...,es,0,0,0,,...,,,,,,,,,,
4,4.0,2022-08-20,10:07:30,1560810727526522880,@dumpfacebooknow @brianstelter @CNN 😂 go get m...,en,0,0,0,,...,,,,,,,,,,


In [7]:
merged_data.shape

(83438, 37)

In [8]:
# here we removed the columns with nan values
merged_data = merged_data.dropna(axis = 1)
merged_data.head()

Unnamed: 0,date,time,id,tweet,language,replies_count,retweets_count,likes_count
0,2022-08-20,10:08:03,1560810865359835136,@Idolmaker66 @6abc They can't prove anything a...,en,0,0,0
1,2022-08-20,10:07:59,1560810847911530497,Health official says risk remains low as Manit...,en,0,0,0
2,2022-08-20,10:07:52,1560810818589130752,Penn State University Student Tests Positive F...,en,0,0,0
3,2022-08-20,10:07:36,1560810752985903105,Viruela del mono: Estos son los tratamientos d...,es,0,0,0
4,2022-08-20,10:07:30,1560810727526522880,@dumpfacebooknow @brianstelter @CNN 😂 go get m...,en,0,0,0


In [9]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83438 entries, 0 to 83437
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   date            83438 non-null  object
 1   time            83438 non-null  object
 2   id              83438 non-null  int64 
 3   tweet           83438 non-null  object
 4   language        83438 non-null  object
 5   replies_count   83438 non-null  int64 
 6   retweets_count  83438 non-null  int64 
 7   likes_count     83438 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 5.1+ MB


In [10]:
tweets = merged_data['tweet']
languages = merged_data['language']
tweets = {'tweet' : tweets, 'language' : languages}

In [11]:
# creating a data frame for the tweets
tweets_df = pd.DataFrame(tweets)
tweets_df

Unnamed: 0,tweet,language
0,@Idolmaker66 @6abc They can't prove anything a...,en
1,Health official says risk remains low as Manit...,en
2,Penn State University Student Tests Positive F...,en
3,Viruela del mono: Estos son los tratamientos d...,es
4,@dumpfacebooknow @brianstelter @CNN 😂 go get m...,en
...,...,...
83433,"Whether we talking bout #COVID or #Monkeypox, ...",en
83434,The World Health Organization has declared mon...,en
83435,"Ukraine, COVID, And Monkeypox: Biden Asks Cong...",en
83436,Biden Regime Begs For Another $47B… For Monkey...,en


In [12]:
# checking the number of language in the dataset
tweets_df['language'].unique()

array(['en', 'es', 'pt', 'fr', 'ta', 'tl', 'de', 'qme', 'tr', 'da', 'ja',
       'ro', 'in', 'qht', 'pl', 'ar', 'el', 'nl', 'zxx', 'lt', 'te', 'no',
       'ca', 'ur', 'it', 'und', 'sv', 'fa', 'fi', 'hi', 'et', 'th', 'cy',
       'gu', 'kn', 'ml', 'vi', 'bn', 'or', 'zh', 'mr', 'iw', 'ru', 'cs',
       'sl', 'hu', 'ko', 'pa', 'ht', 'uk', 'bg', 'ne', 'si', 'lv', 'sr',
       'qam', 'is', 'eu'], dtype=object)

In [13]:
def demojize(tweet):
    emojis = re.compile("["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF" 
        u"\U0001F680-\U0001F6FF" 
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags = re.UNICODE)
    cleaned_text = emojis.sub(r'', tweet)
    return cleaned_text

In [14]:
def remove_urls(text):
    clean_text = re.sub(r'http\S+','',text).strip()
    return clean_text

In [15]:
import string
def remove_numerals_and_punctuations(text):
    numerals = re.sub(r'\d+','',text)
    clean_text_punctuations = numerals.translate(str.maketrans('','',string.punctuation)).strip()
    return clean_text_punctuations

In [16]:
remove_tags_df = tweets_df['tweet'].apply(remove_tags)
removed_emojis = remove_tags_df.apply(lambda x: demojize(x))
removed_urls = removed_emojis.apply(lambda x: remove_urls(x))
clean_text = removed_urls.apply(lambda x : remove_numerals_and_punctuations(x))
clean_text

0        They cant prove anything about covid or monkeypox
1        Health official says risk remains low as Manit...
2        Penn State University Student Tests Positive F...
3        Viruela del mono Estos son los tratamientos di...
4                                 go get monkey pox flamer
                               ...                        
83433    Whether we talking bout  or  Black amp Brown c...
83434    The World Health Organization has declared mon...
83435    Ukraine COVID And Monkeypox Biden Asks Congres...
83436    Biden Regime Begs For Another B… For Monkeypox...
83437    Monkeypox  dos casos suspeitos foram confirmad...
Name: tweet, Length: 83438, dtype: object

In [17]:
# creating dataframe after removing tags
tweets_df = pd.DataFrame({'tweet' : clean_text, 'language' : languages})
tweets_df

Unnamed: 0,tweet,language
0,They cant prove anything about covid or monkeypox,en
1,Health official says risk remains low as Manit...,en
2,Penn State University Student Tests Positive F...,en
3,Viruela del mono Estos son los tratamientos di...,es
4,go get monkey pox flamer,en
...,...,...
83433,Whether we talking bout or Black amp Brown c...,en
83434,The World Health Organization has declared mon...,en
83435,Ukraine COVID And Monkeypox Biden Asks Congres...,en
83436,Biden Regime Begs For Another B… For Monkeypox...,en


In [25]:
tweets_dframe = pd.read_csv('../Datasets/filtered_data.csv')
tweets_dframe.head()

Unnamed: 0,tweets,language
0,Monkey pox,en
1,Of course because a new scam is here the monke...,en
2,Monkeypox can be virtually avoided by less ran...,en
3,An event is not how monkey pox is spread Just ...,en
4,To all of you out there dont ask the source Iv...,en


In [None]:

    
    
    
# word_tokens = word_tokenize(tweets_df['tweet'])
# print(word_tokens)
