- mad2306: Manas Dresswala

- pds2136: Prasham Dhaneshbhai Sheth

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler,Normalizer
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.compose import make_column_transformer
from sklearn.linear_model import Ridge,LinearRegression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer


# 1.1

In [2]:
data = pd.read_csv('winemag-data-130k-v2.csv')

In [3]:
data = data.drop(columns = ['Unnamed: 0'])
data = data.loc[data['country'] == 'US']

In [4]:
# number of missing values in each column
data.isnull().sum()

country                      0
description                  0
designation              17596
points                       0
price                      239
province                     0
region_1                   278
region_2                  3993
taster_name              16774
taster_twitter_handle    19763
title                        0
variety                      0
winery                       0
dtype: int64

In [5]:
# below columns have a lot of missing values and the value of country for all is 'US' this dropping the respective columns
data = data.drop(columns = ['taster_name', 'taster_twitter_handle', 'designation', 'country']) 

# removing text data for the first part
data1 = data.drop(columns = ['description', 'title'])

# dropping the rows with missing values
data1 = data1.dropna() 

In [6]:
y1 = data1[['points']]
X1 = data1.drop(columns = 'points')

In [7]:
preprocess = make_column_transformer((StandardScaler(), ['price']), remainder = TargetEncoder())

X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state = 42)

In [8]:
# baseline using Linear Regression
pipe = make_pipeline(preprocess, LinearRegression())
score = cross_val_score(pipe, X_train, y_train, cv = 5)
pipe.fit(X_train,y_train)

print("The cross-validation score for Linear Regression is: "+ str(np.mean(score)))
print("The score on test set is: " + str(pipe.score(X_test,y_test)))

The cross-validation score for Linear Regression is: 0.39475384732478214
The score on test set is: 0.4215588941233599


In [9]:
# Ridge
pipe = make_pipeline(preprocess, Ridge())
score = cross_val_score(pipe, X_train, y_train, cv = 5)
pipe.fit(X_train,y_train)

print("The cross-validation score for Ridge Regression is: "+ str(np.mean(score)))
print("The score on test set is: " + str(pipe.score(X_test,y_test)))

The cross-validation score for Ridge Regression is: 0.3947542271071985
The score on test set is: 0.4215590685281599


# 1.2

In [10]:
text_data = data[['description', 'points']]

In [11]:
y2 = text_data[['points']]
X2 = text_data.drop(columns = 'points')

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state = 42)

In [12]:
X_train = X_train['description'].tolist()
X_test = X_test['description'].tolist()

In [13]:
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [14]:
lr = LinearRegression().fit(X_train, y_train)
s1 = lr.score(X_test, y_test)
print("Score from Linear Regression on test set is: " +str(s1))

Score from Linear Regression on test set is: 0.5199730911806023


In [15]:
r = Ridge().fit(X_train, y_train)
s1 = r.score(X_test, y_test)
print("Score from Ridge Regression on test set is: " +str(s1))

Score from Ridge Regression on test set is: 0.675389761886473


We see that with using text features, the R2 score is increasing from 0.42 to 0.67. 
Next, we noticed there are a lot of duplicate values in the 'description' feature, so we dropped them and again run our model.

In [16]:
text_data = text_data.drop_duplicates('description', keep=False)

In [17]:
y2 = text_data[['points']]
X2 = text_data.drop(columns = 'points')

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state = 42)

In [18]:
X_train = X_train['description'].tolist()
X_test = X_test['description'].tolist()

In [19]:
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [20]:
lr = LinearRegression().fit(X_train, y_train)
s1 = lr.score(X_test, y_test)
print("Score from Linear Regression on test set is: " +str(s1))

Score from Linear Regression on test set is: 0.468991714814975


In [21]:
r = Ridge().fit(X_train, y_train)
s1 = r.score(X_test, y_test)
print("Score from Ridge Regression on test set is: " +str(s1))

Score from Ridge Regression on test set is: 0.6761203670215854


We see that after dropping duplicates, our r2 increases by 0.01.
So moving forward in 1.3 we use the data after removing the duplicates.

# 1.3

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state = 42)
X_train = X_train['description'].tolist()
X_test = X_test['description'].tolist()

In [23]:
# adding n-grams with minimum frequency = 4
vect = CountVectorizer(ngram_range=(1, 3), min_df=4)
X_train_ngram = vect.fit_transform(X_train)
X_test_ngram = vect.transform(X_test)

r = Ridge().fit(X_train_ngram, y_train)
s1 = r.score(X_test_ngram, y_test)
print("Score from Ridge Regression with default parameter setting on test set is: " +str(s1))

param_grid = [{'alpha':  np.logspace(-2,2,10)}]
grid = GridSearchCV(Ridge(), param_grid, cv = 5)
grid.fit(X_train_ngram, y_train)
print("The best parameters after grid search: " + str(grid.best_params_))
print("Score from Ridge Regression with default parameter setting on test set is: " 
      +str(grid.score(X_test_ngram, y_test)))

Score from Ridge Regression with default parameter setting on test set is: 0.5888761331386896
The best parameters after grid search: {'alpha': 35.93813663804626}
Score from Ridge Regression with default parameter setting on test set is: 0.7141821011858287


In [24]:
# adding tf-idf rescaling
tfidf = make_pipeline(CountVectorizer(ngram_range=(1, 3), min_df=4), TfidfTransformer())
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

r = Ridge().fit(X_train_tfidf, y_train)
s1 = r.score(X_test_tfidf, y_test)
print("Score from Ridge Regression with default parameter setting on test set is: " +str(s1))

param_grid = [{'alpha': np.logspace(-2,2,10)}]
grid = GridSearchCV(Ridge(), param_grid, cv = 5)
grid.fit(X_train_tfidf, y_train)
print("The best parameters after grid search: " + str(grid.best_params_))
print("Score from Ridge Regression with default parameter setting on test set is: " 
      +str(grid.score(X_test_tfidf, y_test)))

Score from Ridge Regression with default parameter setting on test set is: 0.715050198339527
The best parameters after grid search: {'alpha': 0.5994842503189409}
Score from Ridge Regression with default parameter setting on test set is: 0.7127196343560669


In [25]:
# adding characters
char = make_pipeline(CountVectorizer(ngram_range=(1, 3), min_df=4, analyzer="char_wb"), TfidfTransformer())
X_train_char = char.fit_transform(X_train)
X_test_char = char.transform(X_test)

r = Ridge().fit(X_train_char, y_train)
s1 = r.score(X_test_char, y_test)
print("Score from Ridge Regression with default parameter setting on test set is: " +str(s1))

param_grid = [{'alpha': np.logspace(-2,2,10)}]
grid = GridSearchCV(Ridge(), param_grid, cv = 5)
grid.fit(X_train_char, y_train)
print("The best parameters after grid search: " + str(grid.best_params_))
print("Score from Ridge Regression with default parameter setting on test set is: " 
      +str(grid.score(X_test_char, y_test)))

Score from Ridge Regression with default parameter setting on test set is: 0.6846310265706672
The best parameters after grid search: {'alpha': 0.21544346900318834}
Score from Ridge Regression with default parameter setting on test set is: 0.6931825711320354


In [26]:
# removing stop words and trying the previous model
char1 = make_pipeline(CountVectorizer(ngram_range=(1, 3), min_df=4, stop_words="english", analyzer="char_wb"), TfidfTransformer())
X_train_char1 = char1.fit_transform(X_train)
X_test_char1 = char1.transform(X_test)

r = Ridge().fit(X_train_char1, y_train)
s1 = r.score(X_test_char1, y_test)
print("Score from Ridge Regression with default parameter setting on test set is: " +str(s1))

param_grid = [{'alpha': np.logspace(-2,2,10)}]
grid = GridSearchCV(Ridge(), param_grid, cv = 5)
grid.fit(X_train_char1, y_train)
print("The best parameters after grid search: " + str(grid.best_params_))
print("Score from Ridge Regression with default parameter setting on test set is: " 
      +str(grid.score(X_test_char1, y_test)))

Score from Ridge Regression with default parameter setting on test set is: 0.6846310265706672
The best parameters after grid search: {'alpha': 0.21544346900318834}
Score from Ridge Regression with default parameter setting on test set is: 0.6931825711320354


In [27]:
# removing charater analysis from the aboove model 
char1 = make_pipeline(CountVectorizer(ngram_range=(1, 3), min_df=4, stop_words="english"), TfidfTransformer())
X_train_char1 = char1.fit_transform(X_train)
X_test_char1 = char1.transform(X_test)

r = Ridge().fit(X_train_char1, y_train)
s1 = r.score(X_test_char1, y_test)
print("Score from Ridge Regression with default parameter setting on test set is: " +str(s1))

param_grid = [{'alpha':np.logspace(-2,2,10)}]
grid = GridSearchCV(Ridge(), param_grid, cv = 5)
grid.fit(X_train_char1, y_train)
print("The best parameters after grid search: " + str(grid.best_params_))
print("Score from Ridge Regression with default parameter setting on test set is: " 
      +str(grid.score(X_test_char1, y_test)))

Score from Ridge Regression with default parameter setting on test set is: 0.7004719750951894
The best parameters after grid search: {'alpha': 0.5994842503189409}
Score from Ridge Regression with default parameter setting on test set is: 0.6959058917654448


In [28]:
# adding Normalizer to the aboove model 
char2 = make_pipeline(CountVectorizer(ngram_range=(1, 3), min_df=4, stop_words="english"),Normalizer(), TfidfTransformer())
X_train_char2 = char2.fit_transform(X_train)
X_test_char2 = char2.transform(X_test)

r = Ridge().fit(X_train_char2, y_train)
s1 = r.score(X_test_char2, y_test)
print("Score from Ridge Regression with default parameter setting on test set is: " +str(s1))

param_grid = [{'alpha': np.logspace(-2,2,10)}]
grid = GridSearchCV(Ridge(), param_grid, cv = 5)
grid.fit(X_train_char2, y_train)
print("The best parameters after grid search: " + str(grid.best_params_))
print("Score from Ridge Regression with default parameter setting on test set is: " 
      +str(grid.score(X_test_char2, y_test)))

Score from Ridge Regression with default parameter setting on test set is: 0.7004720431097023
The best parameters after grid search: {'alpha': 0.5994842503189409}
Score from Ridge Regression with default parameter setting on test set is: 0.6959062801460703


After looking at all the models, we see that the best R2 is coming when we used the following - 
1. n-gram ranging from 1-3
2. Minimum frequency = 4
3. Removed stop words
4. Normalizing the bag of words vector
5. Tranforming the tokens using tf-idf transformer

We can also see that the regularization parameter changes based on the setting we used for tuning the bag to words model.

# 1.4

In [29]:
# below columns have a lot of missing values and the value of country for all is 'US' this dropping the respective columns
data2 = data.drop(columns = ['title']) 

# dropping the rows with missing values
data2 = data2.dropna()
data2 = (data2.drop_duplicates(keep = 'first'))

In [30]:
# adding non-text features
y3 = data2[['points']]
X3 = data2.drop(columns = 'points')

# splitting the data into training, validation and testing
X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size=0.2, random_state = 42)

In [31]:
text_pipeline = make_pipeline(CountVectorizer(ngram_range=(1, 3), min_df = 4),Normalizer(),TfidfTransformer())
preprocess = make_column_transformer((StandardScaler(), ['price']),
                                     (text_pipeline, 'description'),
                                     remainder = TargetEncoder())

pipe = make_pipeline(preprocess, Ridge())
pipe.fit(X_train, y_train)
print("The score on test set with default parameters is: " + str(pipe.score(X_test,y_test)))

The score on test set with default parameters is: 0.7674700896851882


In [32]:
pipeline_new = Pipeline([('processor', preprocess),('regressor', Ridge())])
param_grid = [{'regressor__alpha': np.logspace(-2,2,10)}]
grid = GridSearchCV(pipeline_new, param_grid, cv = 5)
grid.fit(X_train, y_train)
print("The best parameters after grid search: " + str(grid.best_params_))
print("The score on test set after grid search is " + str(grid.score(X_test, y_test)))

The best parameters after grid search: {'regressor__alpha': 0.5994842503189409}
The score on test set after grid search is 0.7658084506602063


When we combine text and non-text features, we can see that the R2 is increasing to 0.7674.