__Concepts learn from this case study__
-> CountVectorization
-> TFIDFVectorization

# Text classification

__Read the CSV file “Wine.csv”__

In [3]:
import numpy as np
import pandas as pd
import nltk
from nltk import word_tokenize

In [4]:
wine = pd.read_csv('Wine.csv')
wine.head()

Unnamed: 0.1,Unnamed: 0,winery,country,points,description,designation
0,0,Heitz,US,96,This tremendous 100% varietal wine hails from ...,Martha's Vineyard
1,1,Bodega Carmen Rodríguez,Spain,96,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva
2,2,Macauley,US,96,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest
3,3,Ponzi,US,96,"This spent 20 months in 30% new French oak, an...",Reserve
4,4,Domaine de la Bégude,France,95,"This is the top wine from La Bégude, named aft...",La Brûlade


__Drop Unwanted columns__

In [5]:
wine = wine.drop('Unnamed: 0',axis = 1)
wine.head()

Unnamed: 0,winery,country,points,description,designation
0,Heitz,US,96,This tremendous 100% varietal wine hails from ...,Martha's Vineyard
1,Bodega Carmen Rodríguez,Spain,96,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva
2,Macauley,US,96,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest
3,Ponzi,US,96,"This spent 20 months in 30% new French oak, an...",Reserve
4,Domaine de la Bégude,France,95,"This is the top wine from La Bégude, named aft...",La Brûlade


__Using Pre-process File which you created in module 2 case study 1, call the ‘Refine’ function and get the pre-processed text for each ‘description’ in the csv file. Store it in a column named “Refined-Description”__

In [6]:
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import  wordnet
from nltk.stem import WordNetLemmatizer

#Tokenize Function
def Tokenize(string):
    tokens=nltk.tokenize.word_tokenize(string)
    return " ".join(tokens)

#RemoveStopWordsFunction
def RemoveStopWords(string):
    #Removing Punctuations
    for each in punctuation:
        string=string.replace(each,"")
    
    #Removing Stopwords
    english_stopwords=stopwords.words('english')
    stopwords_removed_tokens=[]
    words=string.split(" ")
    
    for each in words:
        if each not in english_stopwords:
            stopwords_removed_tokens.append(each)
    return " ".join(stopwords_removed_tokens) 


#LemmatizeFunction
def Lemmatize(string):
    word_lem=WordNetLemmatizer()
    words=string.split() 
    lemmatizeWords=[]
    for each in words:
        lemmatizeWords.append(word_lem.lemmatize(each))
    return " ".join(lemmatizeWords)

def Refine(string):
    return Lemmatize(RemoveStopWords(Tokenize(string)))


__Apply the function__

In [7]:
#string = wine.description.values
wine['Refined-description'] = wine['description'].apply(Refine)
wine.head()


Unnamed: 0,winery,country,points,description,designation,Refined-description
0,Heitz,US,96,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,This tremendous 100 varietal wine hail Oakvill...
1,Bodega Carmen Rodríguez,Spain,96,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Ripe aroma fig blackberry cassis softened swee...
2,Macauley,US,96,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,Mac Watson honor memory wine made mother treme...
3,Ponzi,US,96,"This spent 20 months in 30% new French oak, an...",Reserve,This spent 20 month 30 new French oak incorpor...
4,Domaine de la Bégude,France,95,"This is the top wine from La Bégude, named aft...",La Brûlade,This top wine La Bégude named highest point vi...


__Using ‘CountVectorization’ function from ‘Vectorization’ python file created in case study 1of this module, vectorise all the rows in ‘Refined-Description’ column which you created in the above step. Store them in a column named “CountVectorizer”__

In [8]:
def CountVectorization(S):
    from nltk import word_tokenize
    S_tokens = word_tokenize(S)
   
    S_corpus = set(S_tokens)
    S_corpus_dict = dict.fromkeys(S_corpus,0)
    S_dict = dict.fromkeys(S_corpus_dict,0)
    for token in S_tokens:
        S_dict[token]+=1
        S_list = list(S_dict.values())
    return (S_list)

In [9]:
wine['CountVectorizer'] = wine['Refined-description'].apply(CountVectorization)

In [10]:
wine.head()

Unnamed: 0,winery,country,points,description,designation,Refined-description,CountVectorizer
0,Heitz,US,96,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,This tremendous 100 varietal wine hail Oakvill...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Bodega Carmen Rodríguez,Spain,96,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Ripe aroma fig blackberry cassis softened swee...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Macauley,US,96,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,Mac Watson honor memory wine made mother treme...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Ponzi,US,96,"This spent 20 months in 30% new French oak, an...",Reserve,This spent 20 month 30 new French oak incorpor...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,Domaine de la Bégude,France,95,"This is the top wine from La Bégude, named aft...",La Brûlade,This top wine La Bégude named highest point vi...,"[1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


__Using ‘TF-IDFVectorization’ function from ‘Vectorization’ python file created in case study 1 of this module, vectorise all the rows in ‘Refined-Description’ column and store the results in ‘TF-IDF Vectorizer Column”__

In [11]:
def TFIDFVectorization(S):
    from sklearn.feature_extraction.text import TfidfVectorizer
    tf_vect = TfidfVectorizer(min_df = 1,lowercase = True,stop_words = 'english')
    tf_matrix = tf_vect.fit_transform([S])
    tf = tf_matrix.toarray()
    tf_list = tf.tolist()
    return tf_list

In [12]:
wine['TF-IDF Vectorizer Column'] = wine['Refined-description'].apply(TFIDFVectorization)
wine.head()

Unnamed: 0,winery,country,points,description,designation,Refined-description,CountVectorizer,TF-IDF Vectorizer Column
0,Heitz,US,96,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,This tremendous 100 varietal wine hail Oakvill...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.16222142113076254, 0.16222142113076254, 0...."
1,Bodega Carmen Rodríguez,Spain,96,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Ripe aroma fig blackberry cassis softened swee...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.18257418583505536, 0.18257418583505536, 0...."
2,Macauley,US,96,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,Mac Watson honor memory wine made mother treme...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.18569533817705186, 0.18569533817705186, 0...."
3,Ponzi,US,96,"This spent 20 months in 30% new French oak, an...",Reserve,This spent 20 month 30 new French oak incorpor...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.15617376188860607, 0.15617376188860607, 0...."
4,Domaine de la Bégude,France,95,"This is the top wine from La Bégude, named aft...",La Brûlade,This top wine La Bégude named highest point vi...,"[1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.16222142113076254, 0.16222142113076254, 0...."


In [17]:
from collections import Counter
for groupname,subset in wine.groupby('country'):
    review_data = subset['Refined-description']
    words = []
    for word in review_data:
        words.extend(word.split(" "))
    print(groupname)
    print(Counter(words).most_common(15))

Argentina
[('flavor', 48), ('finish', 47), ('aroma', 42), ('This', 34), ('palate', 33), ('plum', 28), ('berry', 24), ('fruit', 20), ('note', 17), ('A', 17), ('feel', 14), ('acidity', 14), ('blackberry', 13), ('black', 12), ('The', 12)]
Australia
[('Drink', 5), ('The', 5), ('flavor', 4), ('Pinot', 3), ('Noir', 3), ('cherry', 3), ('note', 3), ('fruit', 3), ('finish', 3), ('black', 2), ('brown', 2), ('sugar', 2), ('roasted', 2), ('2020', 2), ('texture', 2)]
Austria
[('palate', 16), ('fruit', 13), ('wine', 13), ('The', 13), ('pear', 10), ('freshness', 9), ('nose', 9), ('fresh', 8), ('note', 7), ('This', 7), ('cherry', 6), ('structure', 6), ('Drink', 6), ('citrus', 6), ('A', 5)]
Bulgaria
[('cherry', 7), ('black', 7), ('dried', 6), ('This', 5), ('plum', 5), ('tannin', 4), ('It', 4), ('flavor', 4), ('Bulgarian', 3), ('soft', 3), ('aroma', 3), ('red', 3), ('Mavrud', 2), ('nose', 2), ('freshly', 2)]
Canada
[('wine', 5), ('fruit', 4), ('This', 2), ('Cabernet', 2), ('The', 2), ('flavor', 2), ('no

__Save changes to the CSV file.__

In [None]:
wine.to_csv('wine_updated.csv',index = False)

In [None]:
wine_updated = pd.read_csv('wine_updated.csv')
wine_updated.head()