**NLTK installation on Mac/Linux:** ``pip install --user -U nltk`` (Mac/Linux)  
**NLTK installation on Windows:** Following the [installing tutorial](https://docs.python-guide.org/starting/install3/win/#install3-windows) is strongly recommended  
**Upgrade pandas:** ``pip install --upgrade pandas``



In [1]:
import pandas as pd

tweets_df = pd.read_csv('translated_tweets_t1.csv', index_col="Unnamed: 0")

In [2]:
tweets_df.head(5)

Unnamed: 0,topicID,tweetID,label,full_text_ar,full_text_en
0,CT20-AR-05,1221585936095555584,0,بين ثورة لبنان ال١٠٠ يوم وبين ٤٠ العراق \nتتر ...,Between Lebanon and the 100-day revolution bet...
1,CT20-AR-05,1221587916750753793,0,#لبنان_النا_مش_الكن\nأرملة فقيرة\nعليهاإيقاف خ...,# Of Banan_na_mh_alkn\nPoor widow\nAllehaaaaga...
2,CT20-AR-05,1221602993918894081,0,#جدار_العار\nالحيتان الحيطان..عمروا بينن و بين...,# Jaddar_ar\nWhales Alehitan..amra Bann and be...
3,CT20-AR-05,1221653116036304896,0,هل يحمي #جدار_العار السلطة السياسية الفاسدة من...,Does it protect # Jaddar_ar corrupt political ...
4,CT20-AR-05,1221663019345874944,1,موازنة شو؟!\n١- ارقام غير صحيحة.\n٢- حكومة لم ...,Balancing Shu ?!\n1. The figures are incorrect...


In [3]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/administrator/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 

# before using nltk.corpus.stopwords, following the cell above to download stopwords
stopwords = stopwords.words('english')
url_pattern = r"https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}"
token_pattern = r"\b[A-Za-z][A-Za-z]+\b"

# function of preprocessing text, tokenization, stopwords removal, stemming
def preprocess_text(text, url_pattern = url_pattern, token_pattern=token_pattern, 
                    with_urlrm=True, with_stopwordsrm=True, stopwords=stopwords, with_stemming=False):
    # url removal 
    if with_urlrm == True:
        text = re.sub(url_pattern, "", text)
        
    # lower case 
    text_lower = text.lower()
    
    # tokenization 
    words = re.findall(token_pattern, text_lower)
    
    # stopwords removal
    if with_stopwordsrm == True:
        words = [word for word in words if word not in stopwords]
        
    # stemming 
    if with_stemming == True:
        ps = PorterStemmer() 
        words = [ps.stem(word) for word in words]
        
    text_processed = " ".join(words)
    
    return text_processed
    

In [5]:
corpus_no_stemming = []
corpus_with_stemming = []
for text in tweets_df["full_text_en"]:
    corpus_no_stemming.append(preprocess_text(text, with_stemming=False))
    corpus_with_stemming.append(preprocess_text(text, with_stemming=True))
    
tweets_df["preprocessed_text_no_stemming"] = corpus_no_stemming
tweets_df["preprocessed_text_with_stemming"] = corpus_no_stemming
tweets_df.to_csv('translated_tweets__t1_with_preprocessed_text.csv')

In [6]:
tweets_df

Unnamed: 0,topicID,tweetID,label,full_text_ar,full_text_en,preprocessed_text_no_stemming,preprocessed_text_with_stemming
0,CT20-AR-05,1221585936095555584,0,بين ثورة لبنان ال١٠٠ يوم وبين ٤٠ العراق \nتتر ...,Between Lebanon and the 100-day revolution bet...,lebanon day revolution iraq tatar rabble iran ...,lebanon day revolution iraq tatar rabble iran ...
1,CT20-AR-05,1221587916750753793,0,#لبنان_النا_مش_الكن\nأرملة فقيرة\nعليهاإيقاف خ...,# Of Banan_na_mh_alkn\nPoor widow\nAllehaaaaga...,poor widow allehaaaagaf services khllonanfrhaa...,poor widow allehaaaagaf services khllonanfrhaa...
2,CT20-AR-05,1221602993918894081,0,#جدار_العار\nالحيتان الحيطان..عمروا بينن و بين...,# Jaddar_ar\nWhales Alehitan..amra Bann and be...,whales alehitan amra bann alhab hitan,whales alehitan amra bann alhab hitan
3,CT20-AR-05,1221653116036304896,0,هل يحمي #جدار_العار السلطة السياسية الفاسدة من...,Does it protect # Jaddar_ar corrupt political ...,protect corrupt political power people rebellious,protect corrupt political power people rebellious
4,CT20-AR-05,1221663019345874944,1,موازنة شو؟!\n١- ارقام غير صحيحة.\n٢- حكومة لم ...,Balancing Shu ?!\n1. The figures are incorrect...,balancing shu figures incorrect government rec...,balancing shu figures incorrect government rec...
...,...,...,...,...,...,...,...
1495,CT20-AR-19,1232376139148152832,0,في برنامج الاتجاه المعاكس الليلة:\nهل ينفجر ال...,In the program opposite direction tonight:\nDo...,program opposite direction tonight explode sit...,program opposite direction tonight explode sit...
1496,CT20-AR-19,1232381112204193792,0,#الاتجاه_المعاكس - أليس من حق #تركيا أن تحمي أ...,# Alatjah_alamaaks - Is not it right # Turkey ...,right turkey protect national security norther...,right turkey protect national security norther...
1497,CT20-AR-19,1232395009325965317,0,ما أهمية جبل الزاوية في إدلب.. وما الذي تعنيه ...,What is the importance of Mount corner in Idli...,importance mount corner idlib mean trying russ...,importance mount corner idlib mean trying russ...
1498,CT20-AR-19,1232397768573976579,0,يرى البلدان (تركيا وروسيا)أن منطقة البحر الأسو...,See countries (Turkey and Russia) that Sea reg...,see countries turkey russia sea region black a...,see countries turkey russia sea region black a...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_no_stemming = TfidfVectorizer()
X_no_stemming = vectorizer_no_stemming.fit_transform(corpus_no_stemming)

vectorizer_with_stemming = TfidfVectorizer()
X_with_stemming = vectorizer_with_stemming.fit_transform(corpus_with_stemming)

In [8]:
X_no_stemming.shape

(1499, 7804)

In [9]:
X_with_stemming.shape

(1499, 6115)

In [10]:
print(X_no_stemming)

  (0, 1592)	0.18570013360570378
  (0, 2061)	0.13880680090324413
  (0, 6045)	0.20428233349578653
  (0, 4083)	0.20428233349578653
  (0, 3037)	0.20428233349578653
  (0, 5580)	0.12912859353559253
  (0, 1695)	0.16113580150619053
  (0, 1069)	0.1706977030617872
  (0, 3716)	0.15211550317170444
  (0, 1469)	0.16711793371562103
  (0, 7111)	0.18570013360570378
  (0, 6413)	0.1934124433791833
  (0, 2209)	0.1439286952124784
  (0, 6405)	0.20428233349578653
  (0, 7020)	0.16113580150619053
  (0, 1487)	0.1639603533724974
  (0, 6519)	0.20428233349578653
  (0, 6705)	0.14537815348241462
  (0, 2941)	0.1706977030617872
  (0, 3250)	0.20428233349578653
  (0, 6517)	0.20428233349578653
  (0, 3883)	0.2753316874178702
  (0, 5730)	0.20428233349578653
  (0, 6966)	0.20428233349578653
  (0, 3886)	0.32227160301238106
  :	:
  (1498, 1890)	0.20348203428627512
  (1498, 3194)	0.40696406857255024
  (1498, 614)	0.20348203428627512
  (1498, 3418)	0.20348203428627512
  (1498, 7001)	0.20348203428627512
  (1498, 3333)	0.203482034

In [11]:
df_tfidf_no_stemming = pd.DataFrame.sparse.from_spmatrix(X_no_stemming, 
                                                         columns=vectorizer_no_stemming.get_feature_names())
df_tfidf_with_stemming = pd.DataFrame.sparse.from_spmatrix(X_with_stemming, 
                                                         columns=vectorizer_with_stemming.get_feature_names())

In [12]:
df_tfidf_no_stemming.head(5)

Unnamed: 0,aa,aaaaaa,aaaaaaagel,aaaier,aaars,aaarsat,aabernamha,aabyuhzha,aadae,aajabi,...,zlzlhm,zmrolha,zndqoh,zone,zones,zouk,zti,ztoot,zugheib,zuo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df_tfidf_with_stemming.head(5)

Unnamed: 0,aa,aaaaaa,aaaaaaagel,aaaier,aaar,aaarsat,aabernamha,aabyuhzha,aada,aajabi,...,zlv,zlzlhm,zmrolha,zndqoh,zone,zouk,zti,ztoot,zugheib,zuo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df_tfidf_no_stemming.to_csv("tfidf_no_stemming.csv")
df_tfidf_with_stemming.to_csv("tfidf_with_stemming.csv")