In [1]:
import nltk

In [2]:
# Sample text for the notebook: one string literal per sentence, joined by
# implicit concatenation so the value is a single paragraph.
my_review = (
    "Serena’s brilliance and achievements and her legacy as the greatest tennis player ever is not, in my opinion, simply about her tournament record. "
    "Her legacy as the greatest is due to how she has, along with Venus, transformed the sport of tennis. "
    "Since they started playing professionally, more young African American girls and boys have picked up a tennis racquet. "
    "Children of colour would line up to watch the sisters play at tournaments and wait to get autographs after the game. "
    "Tennis courts sprang up in Harlem, New York — a bastion of African American life, culture and heritage. "
    "Naomi Osaka, the former world number one, who herself is bi-racial with a Haitian father and a Japanese mother, often points to Serena as her childhood tennis idol."
)

In [3]:
# Echo the raw review text as the cell output.
my_review

'Serena’s brilliance and achievements and her legacy as the greatest tennis player ever is not, in my opinion, simply about her tournament record. Her legacy as the greatest is due to how she has, along with Venus, transformed the sport of tennis. Since they started playing professionally, more young African American girls and boys have picked up a tennis racquet. Children of colour would line up to watch the sisters play at tournaments and wait to get autographs after the game. Tennis courts sprang up in Harlem, New York — a bastion of African American life, culture and heritage. Naomi Osaka, the former world number one, who herself is bi-racial with a Haitian father and a Japanese mother, often points to Serena as her childhood tennis idol.'

# Data Cleaning 

## Stemming 

In [4]:
from nltk.stem import PorterStemmer

In [5]:
from nltk.corpus import stopwords

In [7]:
# Break the review into a list of sentence strings; the stemming cell below
# overwrites the entries of this list in place.
sentence = nltk.tokenize.sent_tokenize(my_review)
sentence

['Serena’s brilliance and achievements and her legacy as the greatest tennis player ever is not, in my opinion, simply about her tournament record.',
 'Her legacy as the greatest is due to how she has, along with Venus, transformed the sport of tennis.',
 'Since they started playing professionally, more young African American girls and boys have picked up a tennis racquet.',
 'Children of colour would line up to watch the sisters play at tournaments and wait to get autographs after the game.',
 'Tennis courts sprang up in Harlem, New York — a bastion of African American life, culture and heritage.',
 'Naomi Osaka, the former world number one, who herself is bi-racial with a Haitian father and a Japanese mother, often points to Serena as her childhood tennis idol.']

In [10]:
# Porter stemmer instance used by the stemming loop in the next cell.
stemmer = PorterStemmer()

In [13]:
# Build the English stop-word set once. The original rebuilt it inside the
# comprehension, i.e. once for every token of every sentence.
english_stopwords = set(stopwords.words('english'))

# Overwrite each entry of `sentence` with its stop-word-filtered, Porter-stemmed
# tokens joined by single spaces.
# NOTE: the stop-word test uses the token exactly as written, so capitalised
# stop words (e.g. a sentence-initial "Her") are not removed here.
# NOTE: this mutates `sentence` in place; re-running the cell stems the
# already-stemmed text again.
for i, text in enumerate(sentence):
    words = nltk.word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in english_stopwords]
    sentence[i] = ' '.join(words)

In [14]:
# Show the sentences after stop-word removal and stemming.
sentence

['serena ’ brillianc achiev legaci greatest tenni player ever , opinion , simpli tournament record .',
 'her legaci greatest due , along venu , transform sport tenni .',
 'sinc start play profession , young african american girl boy pick tenni racquet .',
 'children colour would line watch sister play tournament wait get autograph game .',
 'tenni court sprang harlem , new york — bastion african american life , cultur heritag .',
 'naomi osaka , former world number one , bi-raci haitian father japanes mother , often point serena childhood tenni idol .']

## Lemmatization

In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
# Lemmatizer for the loop below, plus a fresh sentence split of the original
# review (the `sentence` list was overwritten by the stemming step).
lemmatizer = WordNetLemmatizer()
sentence1 = nltk.tokenize.sent_tokenize(my_review)

In [20]:
# Build the English stop-word set once. The original rebuilt it inside the
# comprehension, i.e. once for every token of every sentence.
english_stopwords = set(stopwords.words('english'))

# Overwrite each entry of `sentence1` with its stop-word-filtered, lemmatized
# tokens joined by single spaces.
# NOTE: the stop-word test uses the token exactly as written, so capitalised
# stop words (e.g. a sentence-initial "Her") are not removed here.
# NOTE: this mutates `sentence1` in place; re-running the cell processes the
# already-processed text again.
for i, text in enumerate(sentence1):
    words = nltk.word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]
    sentence1[i] = ' '.join(words)

In [21]:
# Show the sentences after stop-word removal and lemmatization.
sentence1

['Serena ’ brilliance achievement legacy greatest tennis player ever , opinion , simply tournament record .',
 'Her legacy greatest due , along Venus , transformed sport tennis .',
 'Since started playing professionally , young African American girl boy picked tennis racquet .',
 'Children colour would line watch sister play tournament wait get autograph game .',
 'Tennis court sprang Harlem , New York — bastion African American life , culture heritage .',
 'Naomi Osaka , former world number one , bi-racial Haitian father Japanese mother , often point Serena childhood tennis idol .']

## Bag of Words

In [22]:
import re

In [25]:
ps = PorterStemmer()
# Not used in this cell; kept because a later cell may rely on the name.
wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(my_review)

# Build the English stop-word set once. The original rebuilt it inside the
# comprehension, i.e. once for every word of every sentence.
english_stopwords = set(stopwords.words('english'))

# One cleaned string per sentence: letters only, lower-cased, stop words
# dropped, remaining words Porter-stemmed and joined by single spaces.
corpus = []
for raw_sentence in sentences:
    # Replace every non-ASCII-letter character with a space, so punctuation,
    # digits and hyphens act as word separators ("bi-racial" -> "bi", "racial").
    review = re.sub('[^a-zA-Z]', ' ', raw_sentence)
    # Lower-casing before the stop-word test makes the filter case-insensitive here.
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [26]:
# Show the cleaned, stemmed sentences that feed the vectorizer.
corpus

['serena brillianc achiev legaci greatest tenni player ever opinion simpli tournament record',
 'legaci greatest due along venu transform sport tenni',
 'sinc start play profession young african american girl boy pick tenni racquet',
 'children colour would line watch sister play tournament wait get autograph game',
 'tenni court sprang harlem new york bastion african american life cultur heritag',
 'naomi osaka former world number one bi racial haitian father japanes mother often point serena childhood tenni idol']

## Model Building -  Bag of Words

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer, with the vocabulary capped at 500 features.
cv = CountVectorizer(max_features=500)
# Learn the vocabulary from `corpus` and count terms, then convert the result
# to a dense array so it can be displayed directly.
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()

In [32]:
# Display the dense array produced by the vectorizer.
x

array([[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,