In [2]:
import pandas as pd
import numpy as np

# Twitter Game Dataset

In [5]:
twitter_dataset = pd.read_csv('Datasets/twitter_training.csv')
twitter_dataset.columns

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [6]:
twitter_dataset = twitter_dataset.drop(twitter_dataset.columns[0:2], axis=1)
twitter_dataset.columns

Index(['Positive', 'im getting on borderlands and i will murder you all ,'], dtype='object')

In [7]:
twitter_dataset.columns = ['label', 'text']
twitter_dataset.head()

Unnamed: 0,label,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [8]:
print(twitter_dataset['label'].unique())

['Positive' 'Neutral' 'Negative' 'Irrelevant']


In [10]:
twitter_dataset = twitter_dataset[twitter_dataset['label'] != 'Irrelevant']
print(twitter_dataset['label'].unique())

['Positive' 'Neutral' 'Negative']


In [11]:
# Create a mapping dictionary for label conversion
label_mapping = {
    'Positive': 1,
    'Negative': 0,
    'Neutral': 2
}

# Convert labels using the mapping
twitter_dataset['label'] = twitter_dataset['label'].map(label_mapping)
print(twitter_dataset['label'].unique())

[1 2 0]


In [22]:
twitter_dataset.isnull().sum()

label      0
text     571
dtype: int64

In [23]:
twitter_dataset.dropna(inplace=True)
print(twitter_dataset.shape)

(61120, 2)


In [24]:
# Save the processed dataset
twitter_dataset.to_csv('Datasets/processed_twitter_dataset.csv', index=False)

# Display the shape of the final dataset
print("Final dataset shape:", twitter_dataset.shape)

Final dataset shape: (61120, 2)


# Tweet Sentiment Analysis

In [14]:
tweet_dataset = pd.read_csv('Datasets/TSA.csv')
tweet_dataset.columns

Index(['Text', 'Label'], dtype='object')

In [17]:
print(tweet_dataset['Label'].unique())

['positive' 'negative' 'neutral']


In [19]:
label_mapping = {
    'positive': 1,
    'negative': 0,
    'neutral': 2
}

tweet_dataset['Label'] = tweet_dataset['Label'].map(label_mapping)
print(tweet_dataset['Label'].unique())

[1 0 2]


In [20]:
# Rename columns to match the format of twitter_dataset
tweet_dataset = tweet_dataset.rename(columns={'Label': 'label', 'Text': 'text'})
print(tweet_dataset.columns)

Index(['text', 'label'], dtype='object')


In [25]:
tweet_dataset.isnull().sum()

text     4
label    0
dtype: int64

In [26]:
tweet_dataset.dropna(inplace=True)
print(tweet_dataset.shape)

(691244, 2)


In [27]:
# Save the processed tweet dataset
tweet_dataset.to_csv('Datasets/processed_tweet_dataset.csv', index=False)

# Display the shape of the final dataset
print("Final tweet dataset shape:", tweet_dataset.shape)


Final tweet dataset shape: (691244, 2)


# French Tweets

In [28]:
french_dataset = pd.read_csv('Datasets/french_tweets.csv')
french_dataset.columns

Index(['label', 'text'], dtype='object')

In [29]:
french_dataset.isna().sum()

label    0
text     0
dtype: int64

In [30]:
# Save the processed French dataset
french_dataset.to_csv('Datasets/processed_french_dataset.csv', index=False)

# Display the shape of the final dataset
print("Final French dataset shape:", french_dataset.shape)


Final French dataset shape: (1526724, 2)


# Portuguese Tweets

In [33]:
p_notheme_tweets = pd.read_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/portuguese/NoThemeTweets.csv')
p_tweets_neutral_hash = pd.read_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/portuguese/TweetsNeutralHash.csv')
p_tweets_neutral_news = pd.read_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/portuguese/TweetsNeutralNews.csv')
p_tweets_theme = pd.read_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/portuguese/TweetsWithTheme.csv')

In [36]:
combined_portuguese_tweet_dataset = pd.concat([p_notheme_tweets, p_tweets_neutral_hash, p_tweets_neutral_news, p_tweets_theme])
combined_portuguese_tweet_dataset.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu já dei na época :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)
2,1031760962372689920,Eu só queria conseguir comer alguma coisa pra ...,Tue Aug 21 04:32:37 +0000 2018,Positivo,:)
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:)
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!é uma ""oferta...",Tue Aug 21 04:32:21 +0000 2018,Positivo,:)


In [38]:
combined_portuguese_tweet_dataset.drop(columns=['id', 'tweet_date', 'query_used'], inplace=True)
combined_portuguese_tweet_dataset.columns

Index(['tweet_text', 'sentiment'], dtype='object')

In [39]:
print(combined_portuguese_tweet_dataset['sentiment'].unique())

['Positivo' 'Negativo' 'Neutro']


In [40]:
label_mapping = {
    'Positivo': 1,
    'Negativo': 0,
    'Neutro': 2
}

combined_portuguese_tweet_dataset['sentiment'] = combined_portuguese_tweet_dataset['sentiment'].map(label_mapping)
print(combined_portuguese_tweet_dataset['sentiment'].unique())

[1 0 2]


In [43]:
combined_portuguese_tweet_dataset = combined_portuguese_tweet_dataset.rename(columns={'sentiment': 'label', 'tweet_text': 'text'})
print(combined_portuguese_tweet_dataset.columns)

Index(['text', 'label'], dtype='object')


In [44]:
combined_portuguese_tweet_dataset.isnull().sum()

text     0
label    0
dtype: int64

In [45]:
# Save the combined Portuguese tweet dataset
combined_portuguese_tweet_dataset.to_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/portuguese/combined_portuguese_tweets.csv', index=False)

# Print dataset info
print("Dataset shape:", combined_portuguese_tweet_dataset.shape)
print("\nLabel distribution:")
print(combined_portuguese_tweet_dataset['label'].value_counts())


Dataset shape: (900688, 2)

Label distribution:
label
0    551554
1    295851
2     53283
Name: count, dtype: int64


# Portuguese Regular Dataset

In [51]:
regular_dataset_path = 'C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/portuguese/regular'

In [54]:
import os

In [53]:
regular_csv_files = [f for f in os.listdir(regular_dataset_path) if f.endswith('.csv')]

In [56]:
for csv_file in regular_csv_files:
    try:
        # Create variable name from filename (remove .csv extension)
        var_name = csv_file.replace('.csv', '')
        # Load the CSV file with error handling for delimiter issues
        globals()[var_name] = pd.read_csv(
            os.path.join(regular_dataset_path, csv_file),
            sep=None,  # Let pandas detect the separator
            engine='python'  # Use python engine which is more flexible
        )
        print(f"Loaded {csv_file} into variable '{var_name}'")
    except Exception as e:
        print(f"Error loading {csv_file}: {str(e)}")

Loaded Train100.csv into variable 'Train100'
Loaded Train200.csv into variable 'Train200'
Loaded Train300.csv into variable 'Train300'
Loaded Train3Classes.csv into variable 'Train3Classes'
Loaded Train400.csv into variable 'Train400'
Loaded Train50.csv into variable 'Train50'
Loaded Train500.csv into variable 'Train500'
Loaded TrainTema.csv into variable 'TrainTema'


In [59]:
regular_datasets = [globals()[var_name] for var_name in [f.replace('.csv', '') for f in regular_csv_files]]


Combined regular dataset shape: (1700000, 5)


In [68]:
combined_regular_dataset = pd.concat(regular_datasets, ignore_index=True)

In [69]:
print("Combined regular dataset shape:", combined_regular_dataset.shape)

Combined regular dataset shape: (1700000, 5)


In [70]:
combined_regular_dataset.columns

Index(['id', 'tweet_text', 'tweet_date', 'sentiment', 'query_used'], dtype='object')

In [71]:
print(combined_regular_dataset['sentiment'].value_counts())

sentiment
1    833334
0    833333
2     33333
Name: count, dtype: int64


In [73]:
combined_regular_dataset.drop(columns=['id', 'tweet_date', 'query_used'], inplace=True)
combined_regular_dataset.columns

Index(['tweet_text', 'sentiment'], dtype='object')

In [77]:
combined_regular_dataset = combined_regular_dataset.rename(columns={'sentiment': 'label', 'tweet_text': 'text'})
print(combined_regular_dataset.columns)

Index(['text', 'label'], dtype='object')


In [78]:
combined_regular_dataset.isna().sum()

text     0
label    0
dtype: int64

In [79]:
combined_regular_dataset.to_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/portuguese/combined_regular_dataset.csv', index=False)

In [111]:
combined_portuguese = pd.concat([combined_regular_dataset, combined_portuguese_tweet_dataset], ignore_index=True)

# Save the combined dataset
combined_portuguese.to_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/processed_portuguese_dataset.csv', index=False)

# Print shape of combined dataset
print("Combined Portuguese dataset shape:", combined_portuguese.shape)


Combined Portuguese dataset shape: (2600688, 2)


In [112]:
combined_portuguese.isna().sum()

text     0
label    0
dtype: int64

# Nepali

In [85]:
nepali_dataset = pd.read_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/nepalimoviereviews.csv')
nepali_dataset.columns

Index(['Reviews', 'Emotion'], dtype='object')

In [86]:
print(nepali_dataset['Emotion'].value_counts())

Emotion
1    367
0    235
Name: count, dtype: int64


In [87]:
nepali_dataset = nepali_dataset.rename(columns={'Emotion': 'label', 'Reviews': 'text'})
print(nepali_dataset.columns)

Index(['text', 'label'], dtype='object')


In [92]:
nepali_dataset.isna().sum()

text     0
label    0
dtype: int64

In [88]:
nepali_dataset.to_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/processed_nepali_dataset.csv', index=False)

# Hindi

In [114]:
hindi_dataset = pd.read_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/hindi.csv')
hindi_dataset.columns

Index(['text', 'experience'], dtype='object')

In [115]:
hindi_dataset = hindi_dataset.rename(columns={'experience': 'label', 'text': 'text'})
print(hindi_dataset.columns)

Index(['text', 'label'], dtype='object')


In [116]:
print(hindi_dataset['label'].value_counts())

label
2    273
0    240
1    205
Name: count, dtype: int64


In [117]:
label_mapping = {
    2: 1,
    0: 0,
    1: 2
}
hindi_dataset['label'] = hindi_dataset['label'].map(label_mapping)

In [118]:
print(hindi_dataset['label'].value_counts())

label
1    273
0    240
2    205
Name: count, dtype: int64


In [119]:
hindi_dataset.isna().sum()

text     0
label    0
dtype: int64

In [120]:
hindi_dataset.to_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/processed_hindi_dataset.csv', index=False)

# Nyka

In [125]:
nyka_dataset = pd.read_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/nyka.csv')
nyka_dataset.columns

Index(['content', 'sentiment_labels'], dtype='object')

In [126]:
nyka_dataset = nyka_dataset.rename(columns={'sentiment_labels': 'label', 'content': 'text'})
print(nyka_dataset.columns)

Index(['text', 'label'], dtype='object')


In [127]:
print(nyka_dataset['label'].value_counts())

label
POSITIVE    110458
NEUTRAL      31276
NEGATIVE     13810
Name: count, dtype: int64


In [128]:
label_mapping = {
    "POSITIVE": 1,
    "NEGATIVE": 0,
    "NEUTRAL": 2
}
nyka_dataset['label'] = nyka_dataset['label'].map(label_mapping)
print(nyka_dataset['label'].unique())

[1 0 2]


In [129]:
nyka_dataset.isna().sum()

text     7
label    0
dtype: int64

In [130]:
nyka_dataset.dropna(inplace=True)

In [131]:
nyka_dataset.isna().sum()


text     0
label    0
dtype: int64

In [132]:
print(nyka_dataset['label'].value_counts())

label
1    110453
2     31274
0     13810
Name: count, dtype: int64


In [133]:
nyka_dataset.to_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/processed_nyka_dataset.csv', index=False)

# Creating Single Dataset

In [135]:
# Find all processed datasets
processed_datasets = [f for f in os.listdir('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets') if f.startswith('processed_')]

# Load each dataset into a variable
for dataset in processed_datasets:
    # Create variable name by removing 'processed_' prefix and '.csv' extension
    var_name = dataset.replace('processed_', '').replace('.csv', '')
    # Load dataset into variable
    globals()[var_name] = pd.read_csv(f'C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/{dataset}')
    print(f"Loaded {dataset} into variable '{var_name}'")


Loaded processed_french_dataset.csv into variable 'french_dataset'
Loaded processed_hindi_dataset.csv into variable 'hindi_dataset'
Loaded processed_nepali_dataset.csv into variable 'nepali_dataset'
Loaded processed_nyka_dataset.csv into variable 'nyka_dataset'
Loaded processed_portuguese_dataset.csv into variable 'portuguese_dataset'
Loaded processed_tweet_dataset.csv into variable 'tweet_dataset'
Loaded processed_twitter_dataset.csv into variable 'twitter_dataset'


In [141]:
# Combine all datasets into a single dataframe
combined_dataset = pd.concat([
    french_dataset,
    hindi_dataset,
    nepali_dataset,
    nyka_dataset,
    portuguese_dataset,
    tweet_dataset,
    twitter_dataset
], ignore_index=True)

In [142]:
# Display basic information about the combined dataset
print("Shape of combined dataset:", combined_dataset.shape)
print("\nLabel distribution:")
print(combined_dataset['label'].value_counts())

Shape of combined dataset: (5036633, 2)

Label distribution:
label
0    2437278
1    2264566
2     334789
Name: count, dtype: int64


In [143]:
combined_dataset.isna().sum()

label    0
text     0
dtype: int64

In [144]:
combined_dataset.head()

Unnamed: 0,label,text
0,0,"- Awww, c'est un bummer. Tu devrais avoir davi..."
1,0,Est contrarié qu'il ne puisse pas mettre à jou...
2,0,J'ai plongé plusieurs fois pour la balle. A ré...
3,0,Tout mon corps a des démangeaisons et comme si...
4,0,"Non, il ne se comporte pas du tout. je suis en..."


In [154]:
# Randomize the dataset arrangement
combined_dataset = combined_dataset.sample(frac=1, random_state=42).reset_index(drop=True)
combined_dataset.head()

Unnamed: 0,label,text
0,1,Se não querem o Varandas como Presidente caso ...
1,0,Le produit de Sony ne semble pas presque aussi...
2,1,Se alguém falar mal de ti pelas costas ..... P...
3,0,@HatanoSayuri Me obriga &gt;:(
4,1,@mana_eliana Oi? Estou à espera :)


In [155]:
# Save the combined dataset
combined_dataset.to_csv('C:/Users/chyav/Uni/AAI-510/AAI-510-Project/Datasets/combined_dataset.csv', index=False)