In [None]:
# Import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Read data into pandas dataframe.
# The data set is split across the numbered chunks Kickstarter002.csv ... Kickstarter012.csv.
# DATA_DIR is the only machine-specific value: point it at the folder holding the chunks.
DATA_DIR = "/Users/Nicole/Documents/Kickstarter"
CHUNK_NUMBERS = range(2, 13)

# Read every chunk, then concatenate once. DataFrame.append was removed in pandas 2.0 and
# copied the whole accumulated frame on each call. As with the append chain, each chunk
# keeps its own row labels here (no ignore_index).
kick_data = pd.concat(
    [pd.read_csv("{}/Kickstarter{:03d}.csv".format(DATA_DIR, chunk)) for chunk in CHUNK_NUMBERS]
)

# Keep only the attributes used in the rest of the analysis
relevant_columns = ['backers_count', 'blurb', 'category', 'country', 'goal',
                    'staff_pick', 'state', 'usd_pledged']
kick_data = kick_data.loc[:, relevant_columns]

# Campaigns that are still live have no final state yet: drop them,
# then renumber the remaining rows from 0
is_finished = kick_data.state != 'live'
kick_data = kick_data[is_finished].reset_index(drop=True)

# initial count will return 40169 for most categories, 40167 for blurb column
kick_data.count()

In [None]:
# Remove rows with missing values first, then turn the state column into integers
# for correlation analysis (1 = 'successful', 0 = every other state).
#
# Order matters: encoding before dropna() would turn a missing state into 0 and
# keep the row, silently counting an unknown outcome as a failure.
# The comparison + astype(int) builds a genuinely integer column; assigning through
# .loc[:, "state"] can leave the column with its original object dtype.
# NOTE: like the original, this cell is not idempotent - running it a second time
# compares the already encoded integers with 'successful' and yields all zeros.
kick_data = (
    kick_data
    .dropna()
    .assign(state=lambda frame: frame["state"].eq("successful").astype(int))
)

# check that state values are int
kick_data.info()

In [None]:
# A category is reported as rare when it has fewer entries than this.
# With the value 1 the rare list is always empty, because value_counts() only lists
# values that occur at least once - raise it to surface small categories.
MIN_CATEGORY_COUNT = 1

# Count entries per value in category column (computed once, reused below)
category_counts = kick_data['category'].value_counts()

# Only the last expression of a cell is displayed, so the intermediate
# results have to be printed explicitly or they are silently discarded.
print(category_counts)

# Return entries per value in category column with a count of less than MIN_CATEGORY_COUNT
print(category_counts[category_counts < MIN_CATEGORY_COUNT])

# Count how many unique values in category column
kick_data['category'].nunique()

In [None]:
# Frequency of each distinct value in the country column
kick_data.loc[:, 'country'].value_counts()

In [None]:
# Frequency of each distinct value in the staff_pick column
kick_data.loc[:, 'staff_pick'].value_counts()

In [None]:
# Frequency of each distinct value in the state column
kick_data.loc[:, 'state'].value_counts()

In [None]:
# Seaborn jointplot of backers_count (x axis) against usd_pledged (y axis), drawn in blue
jointplot_options = dict(x='backers_count', y='usd_pledged', color='b')
sns.jointplot(data=kick_data, **jointplot_options)
plt.show()

In [None]:
# Construct seaborn heatmap to get correlations between attributes.
#
# Correlations are only defined for numeric/boolean columns, so those are selected
# explicitly: since pandas 2.0 DataFrame.corr() no longer drops non-numeric columns
# on its own and raises when it meets text (presumably blurb, category and country here).
# infer_objects() runs first so that an object-typed column holding only integers
# (which the recoded state column can be) is converted and kept in the selection.
numeric_attributes = kick_data.infer_objects().select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_attributes.corr(), annot=True, cmap='PRGn')
plt.show()

In [None]:
# Boxplot of usd_pledged grouped by staff_pick, to see how being a staff pick
# affects how much is pledged
staff_pick_box = dict(x="staff_pick", y="usd_pledged")
sns.boxplot(data=kick_data, **staff_pick_box)
plt.show()

In [None]:
# Boxplot of usd_pledged grouped by country, to see how country affects
# how much is pledged
country_box = dict(x="country", y="usd_pledged")
sns.boxplot(data=kick_data, **country_box)
plt.show()

In [None]:
# Median usd_pledged for every (country, staff_pick) combination:
# one row per country, one column per staff_pick value
kick_pt = kick_data.pivot_table(
    values='usd_pledged',
    index='country',
    columns='staff_pick',
    aggfunc=np.median,
)

# Annotated heatmap of the medians, printed with two decimals
sns.heatmap(kick_pt, cmap="PRGn", annot=True, fmt=".2f")
plt.show()

In [None]:
# Number of entries for every (country, staff_pick) combination:
# one row per country, one column per staff_pick value
kick_pt = kick_data.pivot_table(
    index='country',
    columns='staff_pick',
    aggfunc='size',
)

# Annotated heatmap of the counts, printed without decimals
sns.heatmap(kick_pt, cmap="PRGn", annot=True, fmt=".0f")
plt.show()

In [None]:
# Maximum usd_pledged for every (country, state) combination:
# one row per country, one column per value of the state column
kick_pt = kick_data.pivot_table(
    values='usd_pledged',
    index='country',
    columns='state',
    aggfunc=np.max,
)

# Annotated heatmap of the maxima, printed with two decimals
sns.heatmap(kick_pt, cmap="PRGn", annot=True, fmt=".2f")
plt.show()

In [None]:
# Row position at which the data is cut into the two subsets
z = 32000

# Cut by position (iloc) so that every row lands in exactly one subset.
# Label slicing with .loc[:z] / .loc[z:] includes both end labels, which put the row
# labelled z into BOTH subsets. The labels are also no longer contiguous once dropna()
# has removed rows, so .loc[:z] did not necessarily hold z rows.
# NOTE(review): as in the original, the first z rows become the *test* set and the
# remainder the *train* set. With roughly 40k rows that makes the test set the larger
# of the two - confirm this is intended and the names are not swapped.
first_rows = kick_data.iloc[:z]
remaining_rows = kick_data.iloc[z:]

# feature variables: every column except the response
x_test = first_rows.drop(columns='state')
x_train = remaining_rows.drop(columns='state')

# response variable
y_test = first_rows['state']
y_train = remaining_rows['state']

In [None]:
# NOTE(review): everything in this cell is R, while the cells above are Python.
# It has to be run with an R kernel; a Python kernel cannot execute it.
#
# This script simulates data by creating a vector 'tt' and a vector 'yy'
# with yy = 3 + 4*tt + eps, where 'eps' is a random error following a normal
# distribution with mean zero. The training sample draws eps with standard
# deviation 3.5, the test sample with standard deviation 3. We use this data
# to create a kNN regression model.
#
# The kknn library must be installed. It allows both knn regression and
# classification
library(kknn)

# Training sample: 21 points on [-5, 5]
tt <- seq(-5, 5, by = 0.5)
yy <- 4 * tt + 3 + rnorm(length(tt), 0, sd = 3.5)
trainDat <- data.frame(t = tt, y = yy)

# Test sample: a much finer grid on the same interval (built once and reused)
testT <- seq(-5, 5, by = 0.01)
testDat <- data.frame(t = testT, y = 3 + 4 * testT + rnorm(length(testT), 0, 3))

# Experiment with different values of k and see the results.
# Every answer is validated BEFORE a model is fitted, including the first one:
#   * empty or non-numeric input, or k < 1, ends the session
#   * k larger than the number of training points is rejected and asked again
kPrompt <- paste0("Write k between 1 and ", length(tt), '\n')
repeat {
  # as.numeric() yields NA (with a warning) for non-numeric text; the NA is
  # handled explicitly below, so the warning is suppressed.
  k1 <- suppressWarnings(as.numeric(readline(kPrompt)))
  if (is.na(k1) || k1 < 1) break

  if (k1 > length(tt)) {
    message("k must be between 1 and ", length(tt), "; please try again.")
    next
  }

  # Now build the knn regression model. you should set
  # kernel="rectangular" otherwise, the kknn method will use a bit more
  # sophisticated versions of knn and not the one covered in class
  knnRegModel <- kknn(y ~ t, train = trainDat, test = testDat, k = k1,
                      kernel = "rectangular")

  # Create a data frame for predicted coordinates (predicted once, used by
  # whichever plotting variant is active)
  datPred <- data.frame(t = testDat$t, y = predict(knnRegModel, data = testDat))

  ###############################################################
  # Ordinary R graphics functions (active).                     #
  # To use ggplot2 instead, comment out the two lines below and #
  # un-comment the ggplot2 block that follows.                  #
  ###############################################################

  plot(tt, yy, col = "blue", main = paste(k1, "-NN Regression"))
  lines(datPred$t, datPred$y, col = "red")

  ###############################################################
  # ggplot2 variant (inactive).                                 #
  # To use it, un-comment the following lines and comment out   #
  # the regular R graphics lines above.                         #
  ###############################################################

  # The ggplot2 library is a nicer looking graphics package. Its product
  # looks a bit more professional looking. Install and run this library
  # Use the following for ggplot package plotting.
  #library(ggplot2)
  #sp<-ggplot(trainDat,aes(x=t,y=y))
  #sp<-sp+geom_point(color=I("blue"))+
  #     geom_line(data=datPred,aes(x=t,y=y),color=I("red"))
  #sp<-sp+ggtitle(paste(k1,"-NN"))
  # To see the actual graph in ggplot you must use "print" otherwise no
  # drawing will take place:
  #print(sp)
}
