# NLP

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import nltk

In [3]:
par = """
I'm ChatGPT, a state-of-the-art language model developed by OpenAI. I'm based on the GPT-3.5 architecture, which means I'm designed to understand and generate human-like text based on the input I receive. I've been trained on a wide range of internet text up until my last knowledge update in September 2021, which includes books, articles, websites, and more.

I can assist with a variety of tasks, such as answering questions, generating text, providing explanations, offering creative writing assistance, and engaging in conversation on a wide range of topics. Whether you need help with information, writing, or simply want to chat, I'm here to assist you to the best of my abilities. Please feel free to ask me anything, and I'll do my best to provide you with a helpful response.
"""

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize  # separating sentence and work is called tokenize
from nltk.corpus import stopwords  # document
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer  

In [7]:
sentences = sent_tokenize(par)

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [8]:
sentences

["\nI'm ChatGPT, a state-of-the-art language model developed by OpenAI.",
 "I'm based on the GPT-3.5 architecture, which means I'm designed to understand and generate human-like text based on the input I receive.",
 "I've been trained on a wide range of internet text up until my last knowledge update in September 2021, which includes books, articles, websites, and more.",
 'I can assist with a variety of tasks, such as answering questions, generating text, providing explanations, offering creative writing assistance, and engaging in conversation on a wide range of topics.',
 "Whether you need help with information, writing, or simply want to chat, I'm here to assist you to the best of my abilities.",
 "Please feel free to ask me anything, and I'll do my best to provide you with a helpful response."]

In [9]:
len(sentences)

6

In [10]:
word_tokens = []

for each_sentences in sentences:
    tokenized = word_tokenize(each_sentences)
    word_tokens.append(tokenized)

In [11]:
word_tokens

[['I',
  "'m",
  'ChatGPT',
  ',',
  'a',
  'state-of-the-art',
  'language',
  'model',
  'developed',
  'by',
  'OpenAI',
  '.'],
 ['I',
  "'m",
  'based',
  'on',
  'the',
  'GPT-3.5',
  'architecture',
  ',',
  'which',
  'means',
  'I',
  "'m",
  'designed',
  'to',
  'understand',
  'and',
  'generate',
  'human-like',
  'text',
  'based',
  'on',
  'the',
  'input',
  'I',
  'receive',
  '.'],
 ['I',
  "'ve",
  'been',
  'trained',
  'on',
  'a',
  'wide',
  'range',
  'of',
  'internet',
  'text',
  'up',
  'until',
  'my',
  'last',
  'knowledge',
  'update',
  'in',
  'September',
  '2021',
  ',',
  'which',
  'includes',
  'books',
  ',',
  'articles',
  ',',
  'websites',
  ',',
  'and',
  'more',
  '.'],
 ['I',
  'can',
  'assist',
  'with',
  'a',
  'variety',
  'of',
  'tasks',
  ',',
  'such',
  'as',
  'answering',
  'questions',
  ',',
  'generating',
  'text',
  ',',
  'providing',
  'explanations',
  ',',
  'offering',
  'creative',
  'writing',
  'assistance',
  ',

In [14]:
stop_words = stopwords.words('english')

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [17]:
common_punctuations = [',', '.', '!', ':', ';', '?', "", "'", '(',')', '``', '""', "''"]
common_punctuations

[',', '.', '!', ':', ';', '?', '', "'", '(', ')', '``', '""', "''"]

In [19]:
word_cleaned = []
for each_sentence in word_tokens:
    for token_word in each_sentence:
        token_word = token_word.lower()
        if (token_word not in stop_words) and (token_word not in common_punctuations):
            word_cleaned.append(token_word)

In [20]:
word_cleaned

["'m",
 'chatgpt',
 'state-of-the-art',
 'language',
 'model',
 'developed',
 'openai',
 "'m",
 'based',
 'gpt-3.5',
 'architecture',
 'means',
 "'m",
 'designed',
 'understand',
 'generate',
 'human-like',
 'text',
 'based',
 'input',
 'receive',
 "'ve",
 'trained',
 'wide',
 'range',
 'internet',
 'text',
 'last',
 'knowledge',
 'update',
 'september',
 '2021',
 'includes',
 'books',
 'articles',
 'websites',
 'assist',
 'variety',
 'tasks',
 'answering',
 'questions',
 'generating',
 'text',
 'providing',
 'explanations',
 'offering',
 'creative',
 'writing',
 'assistance',
 'engaging',
 'conversation',
 'wide',
 'range',
 'topics',
 'whether',
 'need',
 'help',
 'information',
 'writing',
 'simply',
 'want',
 'chat',
 "'m",
 'assist',
 'best',
 'abilities',
 'please',
 'feel',
 'free',
 'ask',
 'anything',
 "'ll",
 'best',
 'provide',
 'helpful',
 'response']

In [21]:
word_cleaned = list(set(word_cleaned))

In [22]:
porter_steammer = PorterStemmer()
lancaster_steammer = LancasterStemmer()
snowball_steammer = SnowballStemmer(language= 'english')

In [27]:
steammed_tokens = []
for each_word in word_cleaned:
    steammed_tokens.append(lancaster_steammer.stem(each_word))

In [28]:
steammed_tokens

['train',
 'knowledg',
 'anyth',
 'chatgpt',
 'want',
 'wheth',
 'text',
 'fre',
 'assist',
 'simply',
 "'ll",
 'langu',
 'nee',
 'feel',
 'bas',
 'septemb',
 'develop',
 'book',
 "'m",
 'internet',
 'assist',
 'best',
 'human-like',
 'wid',
 'help',
 'gpt-3.5',
 'rang',
 'help',
 'last',
 'off',
 'task',
 'provid',
 'inform',
 'pleas',
 'expl',
 'gen',
 'abl',
 'gen',
 'convers',
 'writ',
 'ask',
 'model',
 'respons',
 'vary',
 'artic',
 'understand',
 'architect',
 'includ',
 'answ',
 'websit',
 "'ve",
 'state-of-the-art',
 'upd',
 'cre',
 'mean',
 'chat',
 'input',
 '2021',
 'quest',
 'top',
 'design',
 'provid',
 'eng',
 'opena',
 'receiv']

In [29]:
from nltk.tag import pos_tag #parts of speech

In [32]:
tagged_tokens = pos_tag(word_cleaned)

In [31]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [33]:
tagged_tokens

[('trained', 'VBN'),
 ('knowledge', 'NN'),
 ('anything', 'NN'),
 ('chatgpt', 'NN'),
 ('want', 'VBP'),
 ('whether', 'IN'),
 ('text', 'JJ'),
 ('free', 'JJ'),
 ('assist', 'NN'),
 ('simply', 'RB'),
 ("'ll", 'MD'),
 ('language', 'NN'),
 ('need', 'VB'),
 ('feel', 'NNS'),
 ('based', 'VBN'),
 ('september', 'NN'),
 ('developed', 'VBD'),
 ('books', 'NNS'),
 ("'m", 'VBP'),
 ('internet', 'JJ'),
 ('assistance', 'NN'),
 ('best', 'JJS'),
 ('human-like', 'JJ'),
 ('wide', 'JJ'),
 ('helpful', 'JJ'),
 ('gpt-3.5', 'JJ'),
 ('range', 'NN'),
 ('help', 'NN'),
 ('last', 'JJ'),
 ('offering', 'NN'),
 ('tasks', 'NNS'),
 ('provide', 'VBP'),
 ('information', 'NN'),
 ('please', 'NN'),
 ('explanations', 'NNS'),
 ('generating', 'VBG'),
 ('abilities', 'NNS'),
 ('generate', 'VBP'),
 ('conversation', 'NN'),
 ('writing', 'VBG'),
 ('ask', 'JJ'),
 ('model', 'NN'),
 ('response', 'NN'),
 ('variety', 'NN'),
 ('articles', 'NNS'),
 ('understand', 'VBP'),
 ('architecture', 'NN'),
 ('includes', 'VBZ'),
 ('answering', 'VBG'),
 ('we