In [None]:
import pandas as pd
import numpy as np
import random
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

In [None]:
# Source CSVs for the two churn datasets (hosted on Dropbox).
CHURN_2_URL = 'https://www.dropbox.com/s/4l8l5u9k5bpbk77/churn_2_classes.csv?dl=1'
CHURN_3_URL = 'https://www.dropbox.com/s/6lzrfy4afncpcta/churn_3_classes.csv?dl=1'

# The first CSV column becomes the row index of each frame.
churn_2_classes, churn_3_classes = (
    pd.read_csv(url, index_col=0) for url in (CHURN_2_URL, CHURN_3_URL)
)

In [None]:
churn_2_classes

In [None]:
df1 = churn_2_classes[churn_2_classes['status'] == 'active']
df2 = churn_2_classes[churn_2_classes['status'] == 'cancelled']

In [None]:
df1 = df1.sample(frac=0.1) # frac = len(df2)/len(df1)

In [None]:
#now the classes are the same size
len(df1) == len(df2)

In [None]:
df = pd.concat([df1,df2], ignore_index=True)
df

In [None]:
churn_3_classes

In [None]:
churn_3_classes['status'].value_counts()

In [None]:
#harder with 3 classes

df1 = churn_3_classes[churn_3_classes['status'] == 'active']
df2 = churn_3_classes[churn_3_classes['status'] == 'cancelled']
df3 = churn_3_classes[churn_3_classes['status'] == 'paused']

In [None]:
df1.shape[0], df2.shape[0], df3.shape[0]

In [None]:
#in this case 
df1 = df1.sample(frac=0.1)
#also works...

In [None]:
df = pd.concat([df1,df2,df3], ignore_index=True)
df

In [None]:
df['status'].value_counts()

In [None]:
rfc = RandomForestClassifier(max_depth=10, n_estimators=100)

In [None]:
y = df['status']
le = preprocessing.LabelEncoder()
le.fit(y)

In [None]:
df.loc[:,'status'] = le.transform(df['status'])
df

In [None]:
X = df.drop(['status', 'median_order_type'], axis=1)

In [None]:
rfc.fit( X,y )

In [None]:
y = churn_2_classes['status']
le = preprocessing.LabelEncoder()
le.fit(y)

churn_2_classes.loc[:,'status'] = le.transform(churn_2_classes['status'])
churn_2_classes

In [None]:
X = churn_2_classes.drop(['status', 'median_order_type'], axis=1)

In [None]:
rfc = RandomForestClassifier(max_depth=10, n_estimators=10, class_weight="balanced")
rfc.fit( X, y )

In [None]:
scores = []
ss = ShuffleSplit(n_splits=3, test_size=0.25)

for train_index, test_index in ss.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    rfc.fit(X_train, y_train)
    
    preds = rfc.predict( X_test )

    scores.append(accuracy_score(preds, y_test))

In [None]:
#more valid, but harder to interpret
scores

In [None]:
rfc = RandomForestClassifier(max_depth=100, n_estimators=100, class_weight="balanced")
rfc.fit( X, y )

In [None]:
scores = []
ss = ShuffleSplit(n_splits=3, test_size=0.25)

for train_index, test_index in ss.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    rfc.fit(X_train, y_train)
    
    preds = rfc.predict( X_test )

    scores.append(accuracy_score(preds, y_test))

In [None]:
# better scores with more params
scores

In [None]:
#pivot tables

In [None]:
suppliers = pd.read_csv("https://www.dropbox.com/s/jjbacrux6bc1b4s/suppliers.csv?dl=1")
# demand = pd.read_csv("https://www.dropbox.com/s/0yikq3pugq87vl9/demand_forecasts.csv?dl=1")
# recipes = pd.read_csv("https://www.dropbox.com/s/az97voeh6i8x3i2/recipes.csv?dl=1")

In [None]:
suppliers

In [None]:
pd.pivot(suppliers, values="ingredient_uuid", columns=['supp_id'])

In [None]:
pd.pivot(suppliers, values="total_tax_per_unit", columns=['supp_id'])

In [None]:
pd.pivot(suppliers,index="ingredient_uuid", values="total_tax_per_unit", columns=['supp_id'])

In [None]:
#THIS LINE!!!!
pd.pivot(suppliers, index="ingredient_uuid", values="total_tax_per_unit", columns=['supp_id', 'country_of_origin'])

In [None]:
pd.pivot(suppliers, index="ingredient_uuid", values="time_to_deliver", columns=['supp_id'])

In [None]:
pd.pivot(suppliers, values="ingredient_uuid", columns=['supp_id'])

In [None]:
pd.pivot_table(suppliers, values="total_tax_per_unit", columns=['supp_id'], aggfunc=np.mean)

In [None]:
pd.pivot_table(suppliers, values="total_tax_per_unit", columns=['supp_id'], aggfunc=np.max)

In [None]:
pd.pivot_table(suppliers, values="total_tax_per_unit", columns=['supp_id'], aggfunc=np.min)

In [None]:
pd.pivot(suppliers, values="total_tax_per_unit", columns=['ingredient_uuid'])

In [None]:
pd.pivot_table(suppliers, values="total_tax_per_unit", columns=['ingredient_uuid'], aggfunc=np.min)

In [None]:
pd.pivot_table(suppliers, values=["total_tax_per_unit", "time_to_deliver"], columns=['supp_id'], aggfunc=np.min)

In [None]:
pd.pivot_table(suppliers, values=["total_tax_per_unit"], columns=['supp_id', 'ingredient_uuid'], aggfunc=np.min)

In [None]:
pd.pivot(suppliers, values=["total_tax_per_unit"], columns=['supp_id', 'ingredient_uuid'])

In [None]:
p=pd.pivot_table(suppliers, values=["total_tax_per_unit"], columns=['supp_id', 'ingredient_uuid'], aggfunc=np.min)

In [None]:
p.T

In [None]:
q=suppliers.set_index(['supp_id', 'ingredient_uuid'])
q

In [None]:
q.unstack()

In [None]:
q.stack().to_frame(name="some new name")

In [None]:
q.unstack(0)

In [None]:
q.unstack(1)

In [None]:
q.melt()

In [None]:
suppliers.melt()

In [None]:
suppliers.melt(id_vars=['supp_id'])

In [None]:
suppliers.melt(id_vars=['supp_id', 'ingredient_uuid'])

In [None]:
suppliers.melt(id_vars=['supp_id', 'ingredient_uuid']).T