In [1]:
import numpy as np
import pandas as pd
import requests

In [2]:
Name = []
Description = []
for i in range(1, 487):
    data = requests.get(f"https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page={i}")
    data = data.json()['results']
    for movie in data:
        Name.append(movie['title'])
        Description.append(movie['overview'])

In [3]:
df = pd.DataFrame({
    'Name': Name,
    'Description': Description
})
df.head()

Unnamed: 0,Name,Description
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
2,The Godfather Part II,In the continuing saga of the Corleone crime f...
3,Schindler's List,The true story of how businessman Oskar Schind...
4,12 Angry Men,The defense and the prosecution have rested an...


## Text Processing

## Lower Casing

In [4]:
df['Description'] = df['Description'].str.lower()
df.head()

Unnamed: 0,Name,Description
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...
1,The Godfather,"spanning the years 1945 to 1955, a chronicle o..."
2,The Godfather Part II,in the continuing saga of the corleone crime f...
3,Schindler's List,the true story of how businessman oskar schind...
4,12 Angry Men,the defense and the prosecution have rested an...


## Removing HTML Tags

In [5]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)
df['Description'] = df['Description'].apply(remove_html_tags)

In [6]:
df.sample(5)

Unnamed: 0,Name,Description
6652,The Losers,"on a mission deep in the bolivian jungle, a te..."
8262,3000 Miles to Graceland,it was an ingenious enough plan: rob the rivie...
7569,Whatsoeverly,corrupt and sleazy entrepreneur cetto la qualu...
4424,Retribution,"while carlos, a banking executive, takes his t..."
6297,Errementari: The Blacksmith and the Devil,"basque country, spain, 1843. a police constabl..."


## Removing Links

In [7]:
def removing_links(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    # This pattern is useful to filterout all types of links
    '''
    text1 = 'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1abb'
    text2 = 'Check out my notebook http://www.kaggle.com/campusx/notebook8223fc1abb'
    text3 = 'Google search here www.google.com'
    text4 = 'For notebook click https://www.kaggle.com/campusx/notebook8223fc1abb to search check www.google.com'
    '''
    return pattern.sub(r'', text)
df['Description'] = df['Description'].apply(removing_links)

In [8]:
df['Description'].sample(5)

3303    a man entranced by his dreams and imagination ...
876     in the final huevos adventure, toto and his fa...
7088    hondo harrelson recruits jim street to join an...
3621    emily arrives in miami with aspirations to bec...
7952    when a group of teenagers goes on a spring bre...
Name: Description, dtype: object

## Remove Puncuations

In [9]:
import string
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', exclude))# maketrans is a mapping function
df['Description'] = df['Description'].apply(remove_punctuation)

In [11]:
df['Description'][0]

'imprisoned in the 1940s for the double murder of his wife and her lover upstanding banker andy dufresne begins a new life at the shawshank prison where he puts his accounting skills to work for an amoral warden during his long stretch in prison dufresne comes to be admired by the other inmates  including an older prisoner named red  for his integrity and unquenchable sense of hope'

## Chat Words Treatment

In [12]:
chat_words = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'B4N': 'Bye For Now',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': 'For What It\'s Worth',
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you (also a chat program)',
    'ILU': 'ILU: \'I Love You',
    'IMHO': 'In My Honest/Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My A.. Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'PITA': 'Pain In The A..',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My A.. Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The F...',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait...',
    '7K': 'Sick: \'-D Laugher',
    'TFW' : 'That feeling when. TFW internet slang often goes in a caption to an image.',
    'MFW' : 'My face when',
    'MRW' : 'My reaction when',
    'IFYP' : 'I feel your pain',
    'LOL' : 'Laughing out loud',
    'TNTL' : 'Trying not to laugh',
    'JK' : 'Just kidding',
    'IDC' : 'I don’t care',
    'ILY' : 'I love you',
    'IMU' : 'I miss you',
    'ADIH' : 'Another day in hell',
    'IDC' : 'I don’t care',
    'ZZZ' : 'Sleeping, bored, tired',
    'WYWH' : 'Wish you were here',
    'TIME' : 'Tears in my eyes',
    'BAE' : 'Before anyone else',
    'FIMH' : 'Forever in my heart',
    'BSAAW' : 'Big smile and a wink',
    'BWL' : 'Bursting with laughter',
    'LMAO' : 'Laughing my a** off',
    'BFF': 'Best friends forever',
    'CSL' : 'Can’t stop laughing'
}

In [13]:
def chat_words_treatment(text):
    for word in text.split():
        if(word in chat_words):
            text = text.replace(word, chat_words[word])
    return text
df['Description'] = df['Description'].apply(chat_words_treatment)

## Spelling Correction

In [14]:
# from textblob import TextBlob
# def spelling_correction(text):
#     textBlb = TextBlob(text)
#     return textBlb.correct().string

# df['Description'] = df['Description'].apply(spelling_correction)

## Removing Stop Words

In [15]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

df['Description'] = df['Description'].apply(remove_stopwords)

## Removing Emojis

In [16]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Loved the movie. It was 😘😘")

# df['Description'] = df['Description'].apply(remove_emoji)

'Loved the movie. It was '

## Replacing Emojis with there Emotion Text

In [17]:
import emoji
print(emoji.demojize('Python is 🔥'))

Python is :fire:


In [18]:
print(emoji.demojize('Loved the movie. It was 😘'))

Loved the movie. It was :face_blowing_a_kiss:


## Takenization

In [20]:
from nltk.tokenize import word_tokenize,sent_tokenize

word_tokenize("".join(df['Description']))

['imprisoned',
 '1940s',
 'double',
 'murder',
 'wife',
 'lover',
 'upstanding',
 'banker',
 'andy',
 'dufresne',
 'begins',
 'new',
 'life',
 'shawshank',
 'prison',
 'puts',
 'accounting',
 'skills',
 'work',
 'amoral',
 'warden',
 'long',
 'stretch',
 'prison',
 'dufresne',
 'comes',
 'admired',
 'inmates',
 'including',
 'older',
 'prisoner',
 'named',
 'red',
 'integrity',
 'unquenchable',
 'sense',
 'hopespanning',
 'years',
 '1945',
 '1955',
 'chronicle',
 'fictional',
 'italianamerican',
 'corleone',
 'crime',
 'family',
 'organized',
 'crime',
 'family',
 'patriarch',
 'vito',
 'corleone',
 'barely',
 'survives',
 'attempt',
 'life',
 'youngest',
 'son',
 'michael',
 'steps',
 'take',
 'care',
 'wouldbe',
 'killers',
 'launching',
 'campaign',
 'bloody',
 'revenge',
 'continuing',
 'saga',
 'corleone',
 'crime',
 'family',
 'young',
 'vito',
 'corleone',
 'grows',
 'sicily',
 '1910s',
 'new',
 'york',
 '1950s',
 'michael',
 'corleone',
 'attempts',
 'expand',
 'family',
 'busi

In [21]:
sent_tokenize("".join(df['Description']))



In [30]:
import spacy
  
# Creating blank language object then 
# tokenizing words of the sentence 
nlp = spacy.blank("en") 
  
doc = nlp("am am are are") 
  
for token in doc: 
    print(token) 

am
am
are
are


## Lemmatization

In [42]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download WordNet and punkt if you haven't already
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Sample sentence
sentence = "He was running and eating at the same time. He has a bad habit of swimming after playing long hours in the Sun."

# Tokenize the sentence into words
words = word_tokenize(sentence)

# Lemmatize each word in the sentence with 'v' as the POS tag for verbs
lemmatized_words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

# Join the lemmatized words back into a sentence
lemmatized_sentence = ' '.join(lemmatized_words)

print(lemmatized_sentence)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


LookupError: 
**********************************************************************
  Resource 'corpora/wordnet' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/root/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

In [41]:
!pip install nltk

