In [26]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from arabert.preprocess import ArabertPreprocessor

import re

In [27]:
messages_df = pd.read_csv('./messages.csv')
dialect_dataset_df = pd.read_csv('./dialect_dataset.csv')

messages_df.dropna(inplace=True)
dialect_dataset_df.dropna(inplace=True)

messages_df['id'] = messages_df['id'].astype(np.int64)
dialect_dataset_df['id'] = dialect_dataset_df['id'].astype(np.int64)

messages_df

Unnamed: 0,id,sentence
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺
...,...,...
458749,1022409931029458944,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅
458751,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...
458752,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...
458754,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...


In [28]:
dialect_dataset_df

Unnamed: 0,id,dialect
0,1175358310087892992,IQ
1,1175416117793349632,IQ
2,1175450108898565888,IQ
3,1175471073770573824,IQ
4,1175496913145217024,IQ
...,...,...
458192,1019484980282580992,BH
458193,1021083283709407232,BH
458194,1017477537889431552,BH
458195,1022430374696239232,BH


In [29]:
merged_df = pd.merge(messages_df, dialect_dataset_df, on='id')

merged_df

Unnamed: 0,id,sentence,dialect
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي,IQ
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ
...,...,...,...
458196,1022409931029458944,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅,BH
458197,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...,EG
458198,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...,EG
458199,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...,EG


In [30]:

encoder = OneHotEncoder()
encoded_dialect = encoder.fit_transform(merged_df[['dialect']])

# Create a new DataFrame with the encoded dialect column
encoded_df = pd.DataFrame(encoded_dialect.toarray(), columns=encoder.get_feature_names_out(['dialect']))

# Concatenate the encoded DataFrame with the original merged DataFrame
merged_df = pd.concat([merged_df, encoded_df], axis=1)

merged_df

Unnamed: 0,id,sentence,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,dialect_JO,dialect_KW,...,dialect_LY,dialect_MA,dialect_OM,dialect_PL,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅,BH,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458197,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458198,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458199,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:


merged_df['user'] = merged_df['sentence'].apply(lambda x: re.findall(r'@(\w+)', x)[0] if re.findall(r'@(\w+)', x) else None)
merged_df['sentence'] = merged_df['sentence'].str.split().str[1:].str.join(' ')

merged_df['sentence'] = merged_df['sentence'].apply(lambda x: re.sub(r'@\w+|http\S+', '', x))
merged_df['sentence'] = merged_df['sentence'].apply(lambda x: re.sub(r'[a-zA-Z]', '', x))


merged_df

Unnamed: 0,id,sentence,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,dialect_JO,dialect_KW,...,dialect_MA,dialect_OM,dialect_PL,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE,user
0,1175358310087892992,لكن بالنهاية .. ينتفض .. يغير .,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Nw8ieJUwaCAAreT
1,1175416117793349632,يعني هذا محسوب على البشر .. حيونه ووحشيه .. وت...,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7zNqXP0yrODdRjK
2,1175450108898565888,مبين من كلامه خليجي,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,KanaanRema
3,1175471073770573824,يسلملي مرورك وروحك الحلوه💐,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HAIDER76128900
4,1175496913145217024,وين هل الغيبه اخ محمد 🌸🌺,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hmo2406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,السحله ضيفي ي بتطلع لك سحليه😅😅,BH,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jolnar121
458197,1057418989293485952,متهيالي دي شكولاته الهالوين فين المحل ده,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mycousinvinnyys
458198,1055620304465215616,في طريق مطروح مركز بهيج والمركز الي الي جمبه ...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MahmoudWaked7
458199,1057418989293485952,متهيالي دي شكولاته الهالوين فين المحل ده,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mycousinvinnyys


In [32]:


model_name="aubmindlab/bert-base-arabertv02-twitter"
arabert_prep = ArabertPreprocessor(model_name=model_name)

merged_df['sentence'] = merged_df['sentence'].apply(arabert_prep.preprocess)
merged_df


Unnamed: 0,id,sentence,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,dialect_JO,dialect_KW,...,dialect_MA,dialect_OM,dialect_PL,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE,user
0,1175358310087892992,لكن بالنهاية . . ينتفض . . يغير .,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Nw8ieJUwaCAAreT
1,1175416117793349632,يعني هذا محسوب على البشر . . حيونه ووحشيه . . ...,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7zNqXP0yrODdRjK
2,1175450108898565888,مبين من كلامه خليجي,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,KanaanRema
3,1175471073770573824,يسلملي مرورك وروحك الحلوه 💐,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HAIDER76128900
4,1175496913145217024,وين هل الغيبه اخ محمد 🌸 🌺,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hmo2406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,السحله ضيفي ي بتطلع لك سحليه 😅 😅,BH,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jolnar121
458197,1057418989293485952,متهيالي دي شكولاته الهالوين فين المحل ده,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mycousinvinnyys
458198,1055620304465215616,في طريق مطروح مركز بهيج والمركز الي الي جمبه ا...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MahmoudWaked7
458199,1057418989293485952,متهيالي دي شكولاته الهالوين فين المحل ده,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mycousinvinnyys


In [39]:
merged_df.dropna(subset=['sentence'], inplace=True)
merged_df['sentence'].isna().sum()


0

In [40]:
region_mapping = {'AE': "GULF",
       'BH': "GULF", 
       'KW': "GULF", 
       'OM':"GULF",
       'QA':"GULF", 
       'SA':"GULF",
       'YE':"GULF",
       'SD':"NILE BASIN", 
       'EG': "NILE BASIN", 
       'IQ': "LEVANT", 
       'JO': "LEVANT",
       'LB':"LEVANT", 
       'PL':"LEVANT", 
       'SY':"LEVANT",
       'DZ': "MAGHREB", 
       'LY':"MAGHREB", 
       'MA':"MAGHREB", 
       'TN':"MAGHREB", 
}

merged_df['region'] = merged_df['dialect'].map(region_mapping)

In [41]:
column_order = ['id', 'user', 'sentence', 'region', 'dialect']
remaining_columns = [col for col in merged_df.columns if col not in column_order]
new_column_order = column_order + remaining_columns

merged_df = merged_df.reindex(columns=new_column_order)
merged_df

Unnamed: 0,id,user,sentence,region,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,...,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE,region_GULF,region_LEVANT,region_MAGHREB,region_NILE BASIN
0,1175358310087892992,Nw8ieJUwaCAAreT,لكن بالنهاية . . ينتفض . . يغير .,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1175416117793349632,7zNqXP0yrODdRjK,يعني هذا محسوب على البشر . . حيونه ووحشيه . . ...,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1175450108898565888,KanaanRema,مبين من كلامه خليجي,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1175471073770573824,HAIDER76128900,يسلملي مرورك وروحك الحلوه 💐,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1175496913145217024,hmo2406,وين هل الغيبه اخ محمد 🌸 🌺,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,jolnar121,السحله ضيفي ي بتطلع لك سحليه 😅 😅,GULF,BH,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
458197,1057418989293485952,mycousinvinnyys,متهيالي دي شكولاته الهالوين فين المحل ده,NILE BASIN,EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
458198,1055620304465215616,MahmoudWaked7,في طريق مطروح مركز بهيج والمركز الي الي جمبه ا...,NILE BASIN,EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
458199,1057418989293485952,mycousinvinnyys,متهيالي دي شكولاته الهالوين فين المحل ده,NILE BASIN,EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [42]:
num_unique_users = merged_df['user'].nunique()
print("Number of unique users:", num_unique_users)

num_dialects = merged_df['dialect'].nunique()
print("Number of dialects:", num_dialects)

num_regions = merged_df['region'].nunique()
print("Number of regions:", num_regions)

Number of unique users: 90110
Number of dialects: 18
Number of regions: 4


In [43]:
encoded_dialect = encoder.fit_transform(merged_df[['region']])

# Create a new DataFrame with the encoded dialect column
encoded_df = pd.DataFrame(encoded_dialect.toarray(), columns=encoder.get_feature_names_out(['region']))

# Concatenate the encoded DataFrame with the original merged DataFrame
merged_df = pd.concat([merged_df, encoded_df], axis=1)
merged_df


Unnamed: 0,id,user,sentence,region,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,...,dialect_TN,dialect_YE,region_GULF,region_LEVANT,region_MAGHREB,region_NILE BASIN,region_GULF.1,region_LEVANT.1,region_MAGHREB.1,region_NILE BASIN.1
0,1175358310087892992,Nw8ieJUwaCAAreT,لكن بالنهاية . . ينتفض . . يغير .,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1175416117793349632,7zNqXP0yrODdRjK,يعني هذا محسوب على البشر . . حيونه ووحشيه . . ...,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1175450108898565888,KanaanRema,مبين من كلامه خليجي,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1175471073770573824,HAIDER76128900,يسلملي مرورك وروحك الحلوه 💐,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1175496913145217024,hmo2406,وين هل الغيبه اخ محمد 🌸 🌺,LEVANT,IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,jolnar121,السحله ضيفي ي بتطلع لك سحليه 😅 😅,GULF,BH,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
458197,1057418989293485952,mycousinvinnyys,متهيالي دي شكولاته الهالوين فين المحل ده,NILE BASIN,EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
458198,1055620304465215616,MahmoudWaked7,في طريق مطروح مركز بهيج والمركز الي الي جمبه ا...,NILE BASIN,EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
458199,1057418989293485952,mycousinvinnyys,متهيالي دي شكولاته الهالوين فين المحل ده,NILE BASIN,EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [44]:
merged_df.to_csv('./arabic_dialect.csv', index=False)
