In [12]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np


In [28]:
messages_df = pd.read_csv('./messages.csv')
dialect_dataset_df = pd.read_csv('./dialect_dataset.csv')

messages_df.dropna(inplace=True)
dialect_dataset_df.dropna(inplace=True)

messages_df['id'] = messages_df['id'].astype(np.int64)
dialect_dataset_df['id'] = dialect_dataset_df['id'].astype(np.int64)

messages_df

Unnamed: 0,id,sentence
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺
...,...,...
458749,1022409931029458944,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅
458751,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...
458752,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...
458754,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...


In [29]:
dialect_dataset_df

Unnamed: 0,id,dialect
0,1175358310087892992,IQ
1,1175416117793349632,IQ
2,1175450108898565888,IQ
3,1175471073770573824,IQ
4,1175496913145217024,IQ
...,...,...
458192,1019484980282580992,BH
458193,1021083283709407232,BH
458194,1017477537889431552,BH
458195,1022430374696239232,BH


In [30]:
merged_df = pd.merge(messages_df, dialect_dataset_df, on='id')

merged_df

Unnamed: 0,id,sentence,dialect
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي,IQ
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ
...,...,...,...
458196,1022409931029458944,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅,BH
458197,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...,EG
458198,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...,EG
458199,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...,EG


In [31]:

encoder = OneHotEncoder()
encoded_dialect = encoder.fit_transform(merged_df[['dialect']])

# Create a new DataFrame with the encoded dialect column
encoded_df = pd.DataFrame(encoded_dialect.toarray(), columns=encoder.get_feature_names_out(['dialect']))

# Concatenate the encoded DataFrame with the original merged DataFrame
merged_df = pd.concat([merged_df, encoded_df], axis=1)

merged_df

Unnamed: 0,id,sentence,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,dialect_JO,dialect_KW,...,dialect_LY,dialect_MA,dialect_OM,dialect_PL,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅,BH,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458197,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458198,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458199,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
merged_df['user'] = merged_df['sentence'].str.split().str.get(0)
merged_df['sentence'] = merged_df['sentence'].str.split().str[1:].str.join(' ')

merged_df

Unnamed: 0,id,sentence,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,dialect_JO,dialect_KW,...,dialect_MA,dialect_OM,dialect_PL,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE,user
0,1175358310087892992,لكن بالنهاية .. ينتفض .. يغير .,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@Nw8ieJUwaCAAreT
1,1175416117793349632,يعني هذا محسوب على البشر .. حيونه ووحشيه .. وت...,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@7zNqXP0yrODdRjK
2,1175450108898565888,مبين من كلامه خليجي,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@KanaanRema
3,1175471073770573824,يسلملي مرورك وروحك الحلوه💐,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@HAIDER76128900
4,1175496913145217024,وين هل الغيبه اخ محمد 🌸🌺,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@hmo2406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,السحله ضيفي ي بتطلع لك سحليه😅😅,BH,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@jolnar121
458197,1057418989293485952,@hanyamikhail1 متهيالي دي شكولاته الهالوين فين...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@mycousinvinnyys
458198,1055620304465215616,@maganenoo في طريق مطروح مركز بهيج والمركز الي...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@MahmoudWaked7
458199,1057418989293485952,@hanyamikhail1 متهيالي دي شكولاته الهالوين فين...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@mycousinvinnyys


In [33]:
import nltk
nltk.download('punkt')

merged_df['tokenized_sentence'] = merged_df['sentence'].apply(nltk.word_tokenize)

merged_df


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fahee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,id,sentence,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,dialect_JO,dialect_KW,...,dialect_OM,dialect_PL,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE,user,tokenized_sentence
0,1175358310087892992,لكن بالنهاية .. ينتفض .. يغير .,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@Nw8ieJUwaCAAreT,"[لكن, بالنهاية, .., ينتفض, .., يغير, .]"
1,1175416117793349632,يعني هذا محسوب على البشر .. حيونه ووحشيه .. وت...,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@7zNqXP0yrODdRjK,"[يعني, هذا, محسوب, على, البشر, .., حيونه, ووحش..."
2,1175450108898565888,مبين من كلامه خليجي,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@KanaanRema,"[مبين, من, كلامه, خليجي]"
3,1175471073770573824,يسلملي مرورك وروحك الحلوه💐,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@HAIDER76128900,"[يسلملي, مرورك, وروحك, الحلوه💐]"
4,1175496913145217024,وين هل الغيبه اخ محمد 🌸🌺,IQ,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@hmo2406,"[وين, هل, الغيبه, اخ, محمد, 🌸🌺]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,السحله ضيفي ي بتطلع لك سحليه😅😅,BH,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@jolnar121,"[السحله, ضيفي, ي, بتطلع, لك, سحليه😅😅]"
458197,1057418989293485952,@hanyamikhail1 متهيالي دي شكولاته الهالوين فين...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@mycousinvinnyys,"[@, hanyamikhail1, متهيالي, دي, شكولاته, الهال..."
458198,1055620304465215616,@maganenoo في طريق مطروح مركز بهيج والمركز الي...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@MahmoudWaked7,"[@, maganenoo, في, طريق, مطروح, مركز, بهيج, وا..."
458199,1057418989293485952,@hanyamikhail1 متهيالي دي شكولاته الهالوين فين...,EG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@mycousinvinnyys,"[@, hanyamikhail1, متهيالي, دي, شكولاته, الهال..."


In [34]:
column_order = ['id', 'user', 'sentence', 'tokenized_sentence', 'dialect']
remaining_columns = [col for col in merged_df.columns if col not in column_order]
new_column_order = column_order + remaining_columns

merged_df = merged_df.reindex(columns=new_column_order)
merged_df

Unnamed: 0,id,user,sentence,tokenized_sentence,dialect,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,...,dialect_LY,dialect_MA,dialect_OM,dialect_PL,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE
0,1175358310087892992,@Nw8ieJUwaCAAreT,لكن بالنهاية .. ينتفض .. يغير .,"[لكن, بالنهاية, .., ينتفض, .., يغير, .]",IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1175416117793349632,@7zNqXP0yrODdRjK,يعني هذا محسوب على البشر .. حيونه ووحشيه .. وت...,"[يعني, هذا, محسوب, على, البشر, .., حيونه, ووحش...",IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1175450108898565888,@KanaanRema,مبين من كلامه خليجي,"[مبين, من, كلامه, خليجي]",IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1175471073770573824,@HAIDER76128900,يسلملي مرورك وروحك الحلوه💐,"[يسلملي, مرورك, وروحك, الحلوه💐]",IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1175496913145217024,@hmo2406,وين هل الغيبه اخ محمد 🌸🌺,"[وين, هل, الغيبه, اخ, محمد, 🌸🌺]",IQ,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1022409931029458944,@jolnar121,السحله ضيفي ي بتطلع لك سحليه😅😅,"[السحله, ضيفي, ي, بتطلع, لك, سحليه😅😅]",BH,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458197,1057418989293485952,@mycousinvinnyys,@hanyamikhail1 متهيالي دي شكولاته الهالوين فين...,"[@, hanyamikhail1, متهيالي, دي, شكولاته, الهال...",EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458198,1055620304465215616,@MahmoudWaked7,@maganenoo في طريق مطروح مركز بهيج والمركز الي...,"[@, maganenoo, في, طريق, مطروح, مركز, بهيج, وا...",EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458199,1057418989293485952,@mycousinvinnyys,@hanyamikhail1 متهيالي دي شكولاته الهالوين فين...,"[@, hanyamikhail1, متهيالي, دي, شكولاته, الهال...",EG,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
num_unique_users = merged_df['user'].nunique()
print("Number of unique users:", num_unique_users)

num_dialects = merged_df['dialect'].nunique()
print("Number of dialects:", num_dialects)

Number of unique users: 134710
Number of dialects: 18


In [37]:
merged_df.to_csv('./arabic_dialect.csv', index=False)
