Importing packages and setting the working directory to combine data files

In [1]:
import os
import glob
import pandas as pd
os.chdir(r"C:\Users\Sultan\Documents\GitHub\Budget_Text_Analysis\util\data\structured\original")

Using glob to match the pattern ‘csv’

In [2]:
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

Combining all data files

In [3]:
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "combined_original.csv", index=False, encoding='utf-8-sig')

Reading Data

In [4]:
#import pandas library
import pandas as pd 
data = pd.read_csv(r"C:\Users\Sultan\Documents\GitHub\Budget_Text_Analysis\util\data\structured\original\combined_original.csv")

Importing Natural Language Toolkit

In [5]:
import nltk

Step 1: Transform all words into lower case

In [6]:
data['word'] = data['word'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data['word'].head(20)

0        ensuring
1              an
2       equitable
3     sustainable
4             and
5       resilient
6       charlotte
7         adopted
8              fy
9          budget
10             fy
11        capital
12     investment
13           plan
14       ensuring
15             an
16      equitable
17    sustainable
18            and
19      resilient
Name: word, dtype: object

Step 2: Correcting spelling

In [7]:
from textblob import TextBlob
data['word'][:20].apply(lambda x: str(TextBlob(x).correct()))

0        ensuring
1              an
2       equitable
3     sustainable
4             and
5        resident
6       charlotte
7         adopted
8              by
9          budget
10             by
11        capital
12     investment
13           plan
14       ensuring
15             an
16      equitable
17    sustainable
18            and
19       resident
Name: word, dtype: object

Step 3: Lemmatization - Converting all words into their root words

In [8]:
from textblob import Word
data['word'] = data['word'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data['word'].head(20)

0        ensuring
1              an
2       equitable
3     sustainable
4             and
5       resilient
6       charlotte
7         adopted
8              fy
9          budget
10             fy
11        capital
12     investment
13           plan
14       ensuring
15             an
16      equitable
17    sustainable
18            and
19      resilient
Name: word, dtype: object

Step 4: Counting stopwords in data

In [9]:
from nltk.corpus import stopwords #importing stopwords from nltk
stop = stopwords.words('english') #storing stopwords in variable stop

data['stopwords'] = data['word'].apply(lambda x: len([x for x in x.split() if x in stop])) 
data[['word','stopwords']].head(20)

Unnamed: 0,word,stopwords
0,ensuring,0
1,an,1
2,equitable,0
3,sustainable,0
4,and,1
5,resilient,0
6,charlotte,0
7,adopted,0
8,fy,0
9,budget,0


Step 5: Removing stop words

In [10]:
data['word'] = data['word'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data['word'].head(20)

0        ensuring
1                
2       equitable
3     sustainable
4                
5       resilient
6       charlotte
7         adopted
8              fy
9          budget
10             fy
11        capital
12     investment
13           plan
14       ensuring
15               
16      equitable
17    sustainable
18               
19      resilient
Name: word, dtype: object

Finding term frequency, TF = (Number of times term T appears in the particular row) / (number of terms in that row)

In [21]:
tf1 = (data['word'][:20]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,ensuring,2.0
1,,4.0
2,equitable,2.0
3,sustainable,2.0
4,resilient,2.0
5,charlotte,1.0
6,adopted,1.0
7,fy,2.0
8,budget,1.0
9,capital,1.0


Inverse Document Frequency (IDF)

In [25]:
#IDF = log(N/n), where, N is the total number of rows  
#and n is the number of rows in which the word was present.

import numpy as np 
for i, word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(data.shape[0]/(len(data[data['word'].str.contains(word)])))
tf1

Unnamed: 0,words,tf,idf
0,ensuring,2.0,8.84451
1,,4.0,0.0
2,equitable,2.0,9.062234
3,sustainable,2.0,8.656769
4,resilient,2.0,11.169074
5,charlotte,1.0,6.801034
6,adopted,1.0,5.047069
7,fy,2.0,3.887842
8,budget,1.0,4.703621
9,capital,1.0,5.531511


Term Frequency – Inverse Document Frequency (TF-IDF)

In [26]:
#TF-IDF is the multiplication of the TF and IDF which we calculated above.

tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,ensuring,2.0,8.84451,17.689021
1,,4.0,0.0,0.0
2,equitable,2.0,9.062234,18.124468
3,sustainable,2.0,8.656769,17.313537
4,resilient,2.0,11.169074,22.338149
5,charlotte,1.0,6.801034,6.801034
6,adopted,1.0,5.047069,5.047069
7,fy,2.0,3.887842,7.775683
8,budget,1.0,4.703621,4.703621
9,capital,1.0,5.531511,5.531511
