QCRI Arabic Dialect Identification (QADI)

Country level Arabic dialect identification (DI) dataset. It provides a collection for benchmarking DI task.

The dataset contains 540,590 tweets from 18 Arab countries.

The dataset files contains ids for the all the tweets identified as from the designated country .[Data](https://github.com/qcri/QADI)

Using twitter Api to get tweets text from tweets ids .


In [None]:
### install tweepy library 
!pip install  tweepy

In [23]:
### import packages
import os 
import pandas as pd 
import numpy as np 
import re
import glob
import nltk 
import string

In [5]:
## mount drive to store data in drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:

# Authenticate to Twitter
consumer_key="XXX"
consumer_secret="XXX"
access_token="XXX"
access_token_secret="XXX"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)


api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

try:
    api.verify_credentials()
    print("Authentication OK")
except:
    print("Error during authentication")

Authentication OK


In [59]:
## get tweets text from tweets ids 
def lookup_tweets(tweet_IDs, api):
    full_tweets = []
    tweet_count = len(tweet_IDs)
    try:
        for i in range((tweet_count // 100) + 1):
            # Catch the last group if it is less than 100 tweets
            end_loc = min((i + 1) * 100, tweet_count)

            full_tweets.extend(
                api.statuses_lookup(id_=tweet_IDs[i * 100:end_loc])
            )
            print(len(full_tweets))
        return full_tweets
    except tweepy.TweepError as e:
        print(e)
        return None

In [8]:
## clone repo to get data 
!git clone https://github.com/qcri/QADI.git

Cloning into 'QADI'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 48 (delta 7), reused 44 (delta 6), pack-reused 0[K
Unpacking objects: 100% (48/48), done.


In [41]:
### get all txt files in dataset directory
main_dir='/content/QADI/dataset'
files=os.listdir(main_dir)


In [None]:
### loop over each file to get tweets
for file in files:
  tweets_ids=[]
  countries=[]
  results_text=[]
  print(file)
  ids=open(os.path.join(main_dir,file)).readlines()
  ids=[i.strip() for i in ids]
  country=file.split('_')[-1].replace('.txt','')
  print(len(ids))
  results=lookup_tweets(ids, api)
  if results!=None:
    for result in results:
      try:
        results_text.append(result.text)
        tweets_ids.append(result.id)
      except Exception as e:
        print(exit)  
  print(len(results_text))
  countries=[country] * len(results_text)
  df=pd.DataFrame({'ids':tweets_ids,
                   'tweet':results_text,
                   'country':countries})
  file_name=file.replace('txt','csv')
  df.to_csv('/content/drive/MyDrive/Data/'+file_name,index=False,encoding='utf-8')


## Concat  all csv file in one dataframe 

In [8]:
## get all csv files 
paths=['/content/drive/MyDrive/Data',]

all_files=[]
for path in  paths:
  all_files.extend(glob.glob(path + "/*.csv"))

In [14]:
len(all_files)

18

In [15]:
## make dataframe from all csv files
li=[]
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0,lineterminator='\n')
    li.append(df)

df_total = pd.concat(li, axis=0, ignore_index=True)

In [16]:
df_total.head()

Unnamed: 0,ids,tweet,country
0,1161776961259286529,البلد العربي اللي عنده لاعب خد دوري الابطال وا...,EG
1,1163476458402197504,@NaguibSawiris \nبس مهندس ايه معرفش 😂 https:/...,EG
2,1163957440318181377,@AmrOfficial3 اتصور معاك \nحياة عيالك يا شيخ 😂,EG
3,1163757982129315841,الناس اللي بتكتب الكلام بالتشكيل والتنوين دول ...,EG
4,1163479461515448320,@MASHALLAH101 @Mr_Tamer_L جايزة اسخف كومنت شوف...,EG


In [17]:
df_total.shape

(467945, 3)

In [20]:
## get arabic stop words from nltk library 
nltk.download('stopwords')
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ«»'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

## remove_punctuations
def remove_punctuations(text):
    return ''.join([char if char  not in punctuations_list else ' ' for char  in text])

## remove_numbers
def replace_numbers(text):
  numbers=['٩', '٨', '٧', '٦', '٥', '٤', '٣', '٢', '١','٠']
  return ''.join([char for char in text if char not in numbers ])



In [24]:
### clean tweets ,remove hashtages ,urls ,emoji ,tags 
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)

def clean(text):
    text=re.sub(r'[a-zA-Z]*',"",text)
    text = re.sub(r'[0-9]*', '', text)
    text=emoji_pattern.sub(' ', text)
    text=replace_numbers(text)
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)
    text=text.replace('#',' ')
    text=text.replace('_',' ')
    text=' '.join([word for word in text.split() if word not in arb_stopwords])
    text=remove_punctuations(text)
    text=re.sub(r'[^ا-ي]+',' ',text)
    text=' '.join( [w for w in text.split() if len(w)>2] )
    text=text.replace('_',' ')
    text=re.sub('\s+',' ',text)

    return text

In [25]:
## apply clean function to dataframe
df_total['tweet_clean']=df_total['tweet'].apply(clean)

In [26]:
df_total.head(20)

Unnamed: 0,ids,tweet,country,tweet_clean
0,1161776961259286529,البلد العربي اللي عنده لاعب خد دوري الابطال وا...,EG,البلد العربي اللي عنده لاعب دوري الابطال والسو...
1,1163476458402197504,@NaguibSawiris \nبس مهندس ايه معرفش 😂 https:/...,EG,مهندس ايه معرفش
2,1163957440318181377,@AmrOfficial3 اتصور معاك \nحياة عيالك يا شيخ 😂,EG,اتصور معاك حياة عيالك شيخ
3,1163757982129315841,الناس اللي بتكتب الكلام بالتشكيل والتنوين دول ...,EG,الناس اللي بتكتب الكلام بالتشكيل والتنوين دول ...
4,1163479461515448320,@MASHALLAH101 @Mr_Tamer_L جايزة اسخف كومنت شوف...,EG,جايزة اسخف كومنت شوفته التاريخ
5,1163768863110000641,@MahmoudRebab @A__mola @Mohamadalzalek2 @Sara_...,EG,
6,1161633304539516933,@ahelmy يا جماعة دي وجهات نظر عادي، وبعدين واح...,EG,جماعة وجهات نظر عادي وبعدين واحد بيشجع الزمالك...
7,1162875848279711745,الزملكاوية اللي هاجموا الناس اللي نشرت لقطة رج...,EG,الزملكاوية اللي هاجموا الناس اللي نشرت لقطة رج...
8,1162378193896202240,@nabilelhalfawy لما بينداس ع المصري بالجزم ف ا...,EG,بينداس المصري بالجزم بلد عربي محدش بيجيبله حقه...
9,1163125682437836801,حوشوها عني هتموتني بنت الكلب 😂 https://t.co/NN...,EG,حوشوها عني هتموتني بنت الكلب


In [27]:
## drop ids ,tweet colunms
df_total.drop(['ids','tweet'],axis=1,inplace=True)

In [28]:
df_total.head()

Unnamed: 0,country,tweet_clean
0,EG,البلد العربي اللي عنده لاعب دوري الابطال والسو...
1,EG,مهندس ايه معرفش
2,EG,اتصور معاك حياة عيالك شيخ
3,EG,الناس اللي بتكتب الكلام بالتشكيل والتنوين دول ...
4,EG,جايزة اسخف كومنت شوفته التاريخ


In [29]:
df_total.shape

(467945, 2)

In [30]:
df_total.country.value_counts()

EG    58223
PL    43990
KW    42936
LY    35316
QA    32706
LB    30850
JO    28585
SA    27309
AE    27261
BH    26054
OM    20602
SY    16459
DZ    16111
IQ    15305
SD    14786
MA    10944
TN    10391
YE    10117
Name: country, dtype: int64

In [35]:
df_total.isnull().sum()

country           0
tweet_clean    2330
dtype: int64

In [33]:

df_total['tweet_clean']=df_total['tweet_clean'].apply(lambda x :  x if len(x.split())>0  else None)

In [36]:
## drop none values
df_total.dropna(inplace=True)
df_total.isnull().sum()

country        0
tweet_clean    0
dtype: int64

In [37]:
df_total.shape

(465615, 2)

In [38]:
### save final data to drive
df_total.to_csv('/content/drive/MyDrive/Data/QADI_tweets.csv',index=False,encoding='utf-8')