# Working with pdf file in Python

In [1]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-2.12.1-py3-none-any.whl (222 kB)
     ------------------------------------ 222.8/222.8 kB 715.8 kB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-2.12.1


In [3]:
import PyPDF2 as pdf

In [14]:
# creating a pdf file object
data = open('TextBlob.pdf','rb')

#creating a pdf reader object
reader = pdf.PdfFileReader(data)

#Printing number of pages in pdf file
print(reader.numPages)

16


In [19]:
# creating a page object
page = reader.getPage(12)

In [20]:
# extracting the text from page
print(page.extractText())

# closing the pdf file object
data.close()

Textblob Sentiment Analysis
Proprietary content. ©Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited


# Working with text file in Python

In [22]:
text_data = open('TestingFile.txt','r')
print(text_data.read())

This text file is for testing purpose only.


# Working with CSV File

In [23]:
import numpy as np
import pandas as pd

In [29]:
data = pd.read_csv("Random.csv")

# we will get the first 5 rows of csv file
data.head()

Unnamed: 0,S.No.,STATE,DISTRICT,BLOCK,MCLF NAME,MCLF CODE
0,1,ANDAMAN AND NICOBAR,NORTH AND MIDDLE ANDAMAN,DIGLIPUR,DEVI SHAKTI BLOCK LEVEL FEDERATION,AN/NO/DPUR/39400
1,2,BIHAR,,SAMASTIPUR,,


In [30]:
data.columns

Index(['S.No.', 'STATE', 'DISTRICT', 'BLOCK', 'MCLF NAME', 'MCLF CODE'], dtype='object')

In [31]:
data.shape

(2, 6)

In [32]:
data.describe()

Unnamed: 0,S.No.
count,2.0
mean,1.5
std,0.707107
min,1.0
25%,1.25
50%,1.5
75%,1.75
max,2.0


In [34]:
data.isnull().sum()

S.No.        0
STATE        0
DISTRICT     1
BLOCK        0
MCLF NAME    1
MCLF CODE    1
dtype: int64

In [36]:
data.isnull().any()

S.No.        False
STATE        False
DISTRICT      True
BLOCK        False
MCLF NAME     True
MCLF CODE     True
dtype: bool

In [37]:
data_part = data.dropna()

In [38]:
data_part

Unnamed: 0,S.No.,STATE,DISTRICT,BLOCK,MCLF NAME,MCLF CODE
0,1,ANDAMAN AND NICOBAR,NORTH AND MIDDLE ANDAMAN,DIGLIPUR,DEVI SHAKTI BLOCK LEVEL FEDERATION,AN/NO/DPUR/39400


In [39]:
data_part.isnull().sum()

S.No.        0
STATE        0
DISTRICT     0
BLOCK        0
MCLF NAME    0
MCLF CODE    0
dtype: int64

In [40]:
data_part.shape

(1, 6)

In [41]:
data.shape

(2, 6)

# Line Tokenization

In [48]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [5]:
import nltk as nl
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sharmajee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [7]:
data = "Welcome to Great Learning!! I am very happy to be the part of this course"
tokens = nl.sent_tokenize(data)
print(tokens)

['Welcome to Great Learning!!', 'I am very happy to be the part of this course']


# Word Tokenization

In [8]:
word_tokens = nl.word_tokenize(data)
print(word_tokens)

['Welcome', 'to', 'Great', 'Learning', '!', '!', 'I', 'am', 'very', 'happy', 'to', 'be', 'the', 'part', 'of', 'this', 'course']


In [9]:
word_tokens.count()

TypeError: list.count() takes exactly one argument (0 given)

In [10]:
type(word_tokens)

list

In [11]:
len(word_tokens)

17

# Stemming

In [14]:
# import these modules
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# choose some words to be stemmed
words = ['like','liking','likes']

for i in words:
    print(i, " : ", ps.stem(i))

like  :  like
liking  :  like
likes  :  like


# Lemmatization

In [17]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sharmajee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Sharmajee\AppData\Roaming\nltk_data...


True

In [18]:
from nltk.stem import WordNetLemmatizer

lemmati = WordNetLemmatizer()

print('socks:', lemmati.lemmatize("socks"))
print('sons:',lemmati.lemmatize("sons"))

socks: sock
sons: son


# Removing stop words

In [21]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sharmajee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
data = """Data science is one of the most trending field to work with. It needs data to give prediction by using the 
past scenraios"""

stop_words = set(stopwords.words("english"))
print(stopwords.words()[620:680])

['ثمنمئة', 'تسعمئة', 'مائة', 'ثلاثمائة', 'أربعمائة', 'خمسمائة', 'ستمائة', 'سبعمائة', 'ثمانمئة', 'تسعمائة', 'عشرون', 'ثلاثون', 'اربعون', 'خمسون', 'ستون', 'سبعون', 'ثمانون', 'تسعون', 'عشرين', 'ثلاثين', 'اربعين', 'خمسين', 'ستين', 'سبعين', 'ثمانين', 'تسعين', 'بضع', 'نيف', 'أجمع', 'جميع', 'عامة', 'عين', 'نفس', 'لا سيما', 'أصلا', 'أهلا', 'أيضا', 'بؤسا', 'بعدا', 'بغتة', 'تعسا', 'حقا', 'حمدا', 'خلافا', 'خاصة', 'دواليك', 'سحقا', 'سرا', 'سمعا', 'صبرا', 'صدقا', 'صراحة', 'طرا', 'عجبا', 'عيانا', 'غالبا', 'فرادى', 'فضلا', 'قاطبة', 'كثيرا']


In [28]:
data = nltk.word_tokenize(data)
data

['Data',
 'science',
 'is',
 'one',
 'of',
 'the',
 'most',
 'trending',
 'field',
 'to',
 'work',
 'with',
 '.',
 'It',
 'needs',
 'data',
 'to',
 'give',
 'prediction',
 'by',
 'using',
 'the',
 'past',
 'scenraios']

In [29]:
stops = set(stopwords.words('english'))

In [30]:
for word in data:
    if word in stops:
        print(word)

is
of
the
most
to
with
to
by
the
