In [55]:
import pandas as pd
import os

In [56]:
# City names grouped by dialect region. Each name is substituted into the
# corpus file name (MADAR.corpus.<city>.tsv) when the regional samples are built.
Levantine_cities = ['Aleppo', 'Damascus', 'Beirut', 'Jerusalem', 'Amman', 'Salt']
Egyptian_cities = ['Alexandria', 'Aswan', 'Cairo']
Gulf_cities = ['Doha', 'Jeddah', 'Riyadh', 'Muscat']
Maghrebi_cities = ['Algiers', 'Fes', 'Rabat']
Iraqi_cities = ['Baghdad', 'Basra', 'Mosul']
# Backward-compatible alias for the original misspelled name, which is still
# referenced elsewhere in this notebook.
Iraqi_ctities = Iraqi_cities

In [57]:
def df_creations(cities, N, list_name, corpus_dir='../MADAR_Corpus', random_state=42):
    """Build a shuffled, labelled sentence sample for one dialect region.

    For every city in ``cities`` the file
    ``{corpus_dir}/MADAR.corpus.{city}.tsv`` is read as a tab-separated table,
    ``N`` rows are drawn at random, and a ``label`` column holding
    ``list_name`` is added. The per-city samples are concatenated, shuffled,
    and re-indexed from 0.

    Parameters
    ----------
    cities : iterable of str
        City names used to build the corpus file names.
    N : int
        Number of rows to sample from each city file.
    list_name : str
        Value written to the ``label`` column. It is also the name of the
        module-level global that the result is stored under.
    corpus_dir : str, optional
        Directory containing the ``MADAR.corpus.<city>.tsv`` files.
    random_state : int, optional
        Seed used for both the per-city sampling and the final shuffle.

    Returns
    -------
    pandas.DataFrame
        The combined, shuffled sample. The same object is also assigned to
        ``globals()[list_name]`` so existing cells that rely on that global
        keep working.

    Raises
    ------
    ValueError
        If none of the city files exist. pandas also raises ``ValueError``
        from ``DataFrame.sample`` when a file has fewer than ``N`` rows.
    """
    sampled_frames = []

    for city in cities:
        file_path = f'{corpus_dir}/MADAR.corpus.{city}.tsv'

        # A missing city file is reported and skipped, not treated as fatal.
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        # NOTE(review): default CSV quoting is applied here; if the corpus has
        # fields that begin with a double quote, confirm they parse as intended.
        city_df = pd.read_csv(file_path, sep='\t')

        # Fixed seed keeps the draw reproducible across runs.
        sampled_frames.append(
            city_df.sample(n=N, random_state=random_state).assign(label=list_name)
        )

    # pd.concat([]) fails with an opaque message; name the actual problem.
    if not sampled_frames:
        raise ValueError(
            f"No corpus files were found for '{list_name}' in '{corpus_dir}'"
        )

    combined_df = (
        pd.concat(sampled_frames, ignore_index=True)
        .sample(frac=1, random_state=random_state)  # shuffle so cities are interleaved
        .reset_index(drop=True)
    )

    # Kept for backward compatibility: publish the result as a global named
    # after the label, in the namespace where this function is defined.
    globals()[list_name] = combined_df
    return combined_df


In [58]:
# Build one labelled sample per region. `list_name` is a string: it becomes the
# value of the `label` column and the name of the global DataFrame created.
# N is rows per city: 6 x 167 and 3 x 334 give 1002 rows, 4 x 250 gives 1000.
df_creations(cities=Levantine_cities, N=167, list_name='Levantine')
df_creations(cities=Egyptian_cities, N=334, list_name='Egyptian')
df_creations(cities=Gulf_cities, N=250, list_name='Gulf')
df_creations(cities=Maghrebi_cities, N=334, list_name='Maghrebi')
df_creations(cities=Iraqi_ctities, N=334, list_name='Iraqi')


In [59]:
# Stack the five regional samples, then shuffle the rows with a fixed seed so
# the regions are interleaved, and renumber the index from 0.
all_regions_df = (
    pd.concat([Levantine, Egyptian, Gulf, Maghrebi, Iraqi], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the first rows, then show how many rows each region contributes.
print(all_regions_df.head())
print(all_regions_df['label'].value_counts())


   sentID.BTEC                          split lang  \
0         7671  corpus-6-test-corpus-26-train  MUS   
1         2118                 corpus-6-train  RAB   
2         9862  corpus-6-test-corpus-26-train  MOS   
3        11549                 corpus-6-train  CAI   
4        15073   corpus-6-test-corpus-26-test  ALG   

                                                sent     label  
0                        خذني إلى طريق ابي، من فضلك.      Gulf  
1                     كا نتمنى هاد الرحلة دوز مزيان.  Maghrebi  
2                   الطريق للنجاح هو "جنمياكو، " صح؟     Iraqi  
3                             ممكن تتكلم بشويش شوبه؟  Egyptian  
4  شحال سعر التوصيلة؟ تقدر تكتبلي سعر التوصيلة عل...  Maghrebi  
label
Maghrebi     1002
Iraqi        1002
Egyptian     1002
Levantine    1002
Gulf         1000
Name: count, dtype: int64


In [63]:
# Write the combined dataset without the index column. 'utf-8-sig' prepends a
# byte-order mark; presumably chosen so spreadsheet tools detect the encoding
# of the Arabic text - confirm downstream readers expect the BOM.
all_regions_df.to_csv(
    'Arabic_dialects.csv',
    index=False,
    encoding='utf-8-sig',
)

In [61]:
# Each regional DataFrame is reachable as a global named after the label
# string passed to df_creations (here: Levantine).
print(Levantine.head(n=5))
# Column dtypes and non-null counts for the regional sample.
Levantine.info()

   sentID.BTEC                          split lang  \
0         6866    corpus-6-test-corpus-26-dev  JER   
1         3859  corpus-6-test-corpus-26-train  AMM   
2         1321  corpus-6-test-corpus-26-train  AMM   
3         8551  corpus-6-test-corpus-26-train  JER   
4        19507    corpus-6-test-corpus-26-dev  BEI   

                                                sent      label  
0  هدا بيجي مع صحن سلطة وانت بتنقي بطاطا مقلية أو...  Levantine  
1                            شو نوع الخبز اللي عندك؟  Levantine  
2  ممكن نحجزلك غرفة مريحة في قصر قديم قريب من ناد...  Levantine  
3                                       قديش الورقة؟  Levantine  
4                                    شو رأم الصندوء؟  Levantine  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sentID.BTEC  1002 non-null   int64 
 1   split        1002 non-null   object
 2   lang