<a href="https://colab.research.google.com/github/Skander28/Models/blob/main/DialectData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import pandas as pd
fi_df = pd.read_csv('/content/drive/MyDrive/train_labeled.tsv', sep='\t')

In [27]:
tweets_mag = fi_df[fi_df['#3 country_label'].isin(['Tunisia','Algeria','Morocco','Libya','Mauritania'])]

In [28]:
tweets_mag['#3 country_label'].value_counts()

Algeria       1491
Morocco       1070
Libya         1070
Tunisia        750
Mauritania     210
Name: #3 country_label, dtype: int64

In [29]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(tweets_mag.drop('#3 country_label', axis=1), tweets_mag['#3 country_label'])
tweets_mag = pd.concat([X_resampled, y_resampled], axis=1)

In [30]:
tweets_mag['#3 country_label'].value_counts()

Morocco       1491
Libya         1491
Mauritania    1491
Algeria       1491
Tunisia       1491
Name: #3 country_label, dtype: int64

In [31]:
import pandas as pd
tweets_ids = pd.read_csv('/content/drive/MyDrive/maghrebi/dialect_dataset.csv')
     
# Read the given dataset which contain the Ids and the labels
df_clean = pd.read_csv('/content/drive/MyDrive/maghrebi/messages.csv',
                 lineterminator='\n')

column_names = ['id', 'tweets']  # list of column names

df_clean.columns = column_names

In [32]:
tweets_dataset = pd.merge(tweets_ids, df_clean, on='id', how='inner')

In [33]:
filtered_df = tweets_dataset[tweets_dataset['dialect'].isin(['TN','DZ','MA','LY'])]

In [34]:
filtered_df['dialect'].value_counts()

LY    36499
DZ    16183
MA    11539
TN     9246
Name: dialect, dtype: int64

In [35]:
filtered_df['dialect'].replace({'LY': 'Libya', 'DZ': 'Algeria', 'MA': 'Morocco', 'TN': 'Tunisia'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['dialect'].replace({'LY': 'Libya', 'DZ': 'Algeria', 'MA': 'Morocco', 'TN': 'Tunisia'}, inplace=True)


In [36]:
tweets_maghrebi = tweets_mag.rename(columns={'#1 tweet_ID': 'id', '#2 tweet_content': 'tweets', '#3 country_label': 'dialect'})
tweets_maghrebi.drop('#4 province_label', axis=1, inplace=True)

In [37]:
dialect_dfs = {}

# Loop over unique dialects and add rows to corresponding dialect DataFrame
for dialect in filtered_df['dialect'].unique():
    dialect_df = filtered_df[filtered_df['dialect'] == dialect]
    new_dialect_df = tweets_maghrebi[tweets_maghrebi['dialect'] == dialect]
    dialect_df = pd.concat([dialect_df, new_dialect_df])
    dialect_dfs[dialect] = dialect_df

# Add a new dialect to the dictionary of dialect DataFrames
new_dialect_df = tweets_maghrebi[tweets_maghrebi['dialect'] == 'Mauritania']
dialect_dfs['Mauritania'] = new_dialect_df

# Concatenate all dialect DataFrames into a single DataFrame
df = pd.concat(dialect_dfs.values())

# Optional: reset the index of the final DataFrame
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,id,dialect,tweets
0,1009754958479151232,Libya,@toha_Altomy @gy_yah قليلين ادب ومنافقين. لو ا...
1,1009794751548313600,Libya,@AlmFaisal 😂😂 الليبيين متقلبين!!!\nبس بالنسبة ...
2,1019989115490787200,Libya,@smsm071990 @ALMOGRBE كل 20 تانيه شاب ليبي بير...
3,1035479791758135168,Libya,@AboryPro @lyranoo85 رانيا عقليتك متخلفة. اولا...
4,1035481122921164800,Libya,@lyranoo85 شكلك متعقدة علشان الراجل لي تحبيه ا...


In [38]:
df["dialect"].value_counts()

Libya         37990
Algeria       17674
Morocco       13030
Tunisia       10737
Mauritania     1491
Name: dialect, dtype: int64

In [39]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df.drop('dialect', axis=1), df['dialect'])
df = pd.concat([X_resampled, y_resampled], axis=1)
df["dialect"].value_counts()

Libya         37990
Tunisia       37990
Morocco       37990
Algeria       37990
Mauritania    37990
Name: dialect, dtype: int64

In [40]:
import nltk 
from nltk.corpus import stopwords
     
nltk.download('stopwords')
stop=stopwords.words('arabic')

df['tweets'] = df['tweets'].apply(lambda x:" ".join(x for x in x.split() if x not in stop))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
df['tweets'] = df['tweets'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in x.split()]))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
#save after preprocessing the tweets
filtered_df.to_csv('MA_DialectData.csv')  
     
#copy the csv file to google drive
!cp MA_DialectData.csv /content/drive/MyDrive/
     