In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Uploading data

In [None]:
# Read the Wine Magazine review export; the path is relative to the
# notebook's working directory.
full_data = pd.read_csv(filepath_or_buffer='winemag-data-130k-v2.csv')
full_data

# Cleaning data

In [None]:
# Discard the columns that the rest of the notebook never reads.
clean_data = full_data.drop(
    columns=['Unnamed: 0', 'taster_twitter_handle', 'designation', 'title']
)

In [None]:
# Preview the first five rows after the column drop.
clean_data.head(n=5)

In [None]:
# Quantify the free-text description column as two boolean flags, is_fruity
# and is_dry, by matching keyword patterns against the lower-cased text.
# NOTE: this cell replaces clean_data and removes 'description', so it can
# only be run once per kernel after the cell that builds clean_data.
description_lower = clean_data['description'].str.lower()
clean_data = clean_data.drop(columns=['description']).assign(
    is_fruity=description_lower.str.contains('fruit|ripe|berry|orange|plum|cherry|citrus|prune|currant|peach|jam'),
    is_dry=description_lower.str.contains('dry|tannin|astringent|herb|earth|chalk|austere'),
    # One per row, so that summing Number per group yields a row count.
    # A scalar broadcasts to every row; the previous
    # pd.Series([1]*full_data.size) built rows*columns elements and relied
    # on index alignment to discard the surplus.
    Number=1,
)
clean_data.head()

# Summary Stats, Pandas function, and MatPlotLib

### Fruitiness of wine based on country of origin. Double click on large plots to scroll through.

In [None]:
# Per-country mean of every numeric/boolean column. numeric_only=True keeps
# any non-numeric columns out of the aggregation; pandas >= 2.0 raises a
# TypeError instead of silently dropping them.
mean_by_country = clean_data.groupby('country').mean(numeric_only=True)
# The mean of the boolean is_fruity flag is the fraction of a country's wines
# whose description matched a "fruity" keyword.
plt.figure(figsize=(80,10))
plt.bar(mean_by_country.index,mean_by_country.is_fruity)
plt.ylim((0,1))
plt.ylabel('Mean of is fruity')
plt.xlabel('Country of origin')
plt.title('Wine Fruitiness vs. Country of Origin (all countries)')

This graph (names of countries are visible in downloaded image) shows little significant difference in wine fruitiness between countries. Most countries produce more fruity wine than non-fruity wine.

In [None]:
# Top twelve wine-producing countries, ranked by number of rows in the data.
# Author's note: the first twelve countries of origin each have more than
# 1000 wines, so they are used to provide a more meaningful analysis.
# Only the Number column is summed; summing every column would also try to
# aggregate the text columns, which is wasteful and can raise a TypeError on
# recent pandas when those columns contain missing values.
top_twelve_countries = (
    clean_data.groupby('country')['Number']
    .sum()
    .sort_values(ascending=False)[0:12]
    .index
)
# Reuses mean_by_country from the all-countries fruitiness cell.
top_countries_mean = mean_by_country.loc[top_twelve_countries]
plt.figure(figsize=(15,5))
plt.bar(top_countries_mean.index,top_countries_mean.is_fruity)
plt.ylim((0,1))
plt.ylabel('Mean of is fruity')
plt.xlabel('Country of origin')
plt.title('Wine Fruitiness vs. Country of Origin (top 12 Countries)')

This graph does not provide more meaningful or different interpretations of the data.

### Dryness of wine based on country of origin

In [None]:
# Per-country mean of every numeric/boolean column. numeric_only=True keeps
# any non-numeric columns out of the aggregation; pandas >= 2.0 raises a
# TypeError instead of silently dropping them.
mean_by_country = clean_data.groupby('country').mean(numeric_only=True)
# The mean of the boolean is_dry flag is the fraction of a country's wines
# whose description matched a "dry" keyword.
plt.figure(figsize=(80,10))
plt.bar(mean_by_country.index,mean_by_country.is_dry)
plt.ylim((0,1))
plt.ylabel('Mean of is dry')
plt.xlabel('Country of origin')
plt.title('Wine Dryness vs Country of Origin (all countries)')

This plot shows more variation in wine dryness across countries of origin. China and Slovakia appear to produce more dry wines than sweet wines compared to other countries. England, Greece, Macedonia, Moldova, Serbia, Switzerland, and Ukraine appear to produce far more sweet wines than dry wines compared to the other countries. However, many of these countries, including Slovakia and the countries with no bars, likely do not produce significant quantities of wine to begin with. To see how different countries affect the global wine market in terms of sweet and dry wines, we should look at a bar plot with only the top 12 wine-producing countries.

In [None]:
# Share of "dry" descriptions for the twelve largest producers, drawn through
# the explicit Axes interface.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(top_countries_mean.index, top_countries_mean.is_dry)
ax.set_ylim((0, 1))
ax.set_ylabel('Mean of is dry')
ax.set_xlabel('Country of origin')
ax.set_title('Wine Dryness vs. Country of Origin (top 12 countries)')

### Points of Wine Based on Country of Origin

In [None]:
# Per-country mean of every numeric/boolean column. numeric_only=True keeps
# any non-numeric columns out of the aggregation; pandas >= 2.0 raises a
# TypeError instead of silently dropping them.
mean_by_country = clean_data.groupby('country').mean(numeric_only=True)
plt.figure(figsize=(80,10))
plt.bar(mean_by_country.index,mean_by_country.points)
# The y-axis is restricted to 80-100 so differences between bars stay visible.
plt.ylim((80,100))
plt.ylabel('Mean number of points')
plt.xlabel('Country of origin')
plt.title('Average Points vs Country of Origin (all countries)')

Brazilian, Egyptian, Mexican, Peruvian, and Ukrainian wines are not well liked.

In [None]:
# Average points for the twelve largest producers, drawn through the explicit
# Axes interface with the same 80-100 y-range as the all-countries chart.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(top_countries_mean.index, top_countries_mean.points)
ax.set_ylim((80, 100))
ax.set_ylabel('Mean number of points')
ax.set_xlabel('Country of origin')
ax.set_title('Average Points vs Country of Origin (top 12 countries)')

However, wines from countries that produce most of the world's wine appear to have similar average point values. 

### Does the price of the wine affect its points?

In [None]:
# Points on the x-axis against price on the y-axis; small, half-transparent
# markers keep overlapping observations readable.
ax = plt.gca()
ax.scatter(clean_data.points, clean_data.price, s=10, alpha=0.5)
ax.set_title('Wine Mag Points vs. Price of Wine')
ax.set_xlabel('Points')
ax.set_ylabel('Price ($)')

There appears to be some correlation between the price of the wine and the number of points it receives, but the relationship does not look strong.

# Sklearn

I am trying to determine which factors most influence the number of points a wine receives. This essentially means looking at the price, fruitiness, and dryness of the wine, because data like country of origin and variety of wine have not been quantified. This is a supervised learning task because there are specified input (price, fruitiness, dryness) and output (points) variables between which I try to find a relationship. I am going to use linear regression, since the output (points) is numeric.

In [None]:
import sklearn

In [None]:
# Assemble the modelling table: keep the three predictors and the target,
# drop rows with any missing value, and cast the boolean flags to float.
feature_names = np.array(['price', 'is_fruity', 'is_dry'])
numericdata = (
    clean_data[['price', 'is_fruity', 'is_dry', 'points']]
    .dropna(axis=0)
    .astype({'is_fruity': float, 'is_dry': float})
)
# X holds the predictors; y is kept as a single-column 2-D array.
X = numericdata[['price', 'is_fruity', 'is_dry']].values
y = numericdata[['points']].values
feature_names

In [None]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for evaluation (scikit-learn's default split size).
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same scores and coefficients below.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator created with its default settings.
reg = LinearRegression()

In [None]:
# Fit on the training split only; the test split is kept for evaluation.
reg.fit(X_train,y_train)

In [None]:
# Score of the fitted model on the training split (R^2 for scikit-learn
# regressors).
reg.score(X_train,y_train)
# Author's note: this is a terrible fit, i.e. the three predictors explain
# little of the variation in points.

In [None]:
# Predicted points for the held-out split.
# NOTE(review): preds is not referenced by any later cell visible here.
preds = reg.predict(X_test)

In [None]:
# Score of the fitted model on the held-out split (R^2 for scikit-learn
# regressors).
reg.score(X_test,y_test)
# Author's note: the fits are similarly terrible for both data sets.

In [None]:
# y was built as a single-column 2-D array, so coef_ presumably comes back
# with a leading length-1 axis; squeeze removes length-1 axes so the
# coefficients can be paired one-to-one with feature_names.
# NOTE(review): this overwrites an attribute of the fitted estimator in place.
reg.coef_ = np.squeeze(reg.coef_)

In [None]:
# Label each fitted coefficient with the name of its predictor.
pd.Series(data=reg.coef_, index=feature_names)

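The results show that the price of the wine has very little to do with the number of points it receives. The coefficients for is_fruity and is_dry show that the fruitiness of the wine plays the largest role in its point value and the dryness plays a slightly lesser role. Note, however, that the coefficients are not directly comparable: the price coefficient is the change in points per dollar, while the is_fruity and is_dry coefficients are the change in points between a flag of 0 and 1. Given the poor fit reported above, these conclusions should be treated as tentative.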