The source code for this project can be found [here](https://github.com/Xianzhiwang1/ml-0451-final-proj).



# Rough skeleton code harvested from the penguin blog post for easy reference

In [1]:
from matplotlib import pyplot as plt
import numpy as np
from final_project_code import FinalProject 

In [2]:
import pandas as pd

# Palmer Penguins training split, read directly from the CSCI 0451 course repository.
train_url = "https://raw.githubusercontent.com/middlebury-csci-0451/CSCI-0451/main/data/palmer-penguins/train.csv"
train = pd.read_csv(train_url)

In [3]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder once on the training labels; prepare_data() below reuses
# this module-level object to turn Species into integer codes.
le = LabelEncoder()
le.fit(train["Species"])

def prepare_data(df):
  """Convert a raw penguins frame into a feature table and an encoded label vector.

  Steps, in order:
    1. drop the columns listed in ``unused``;
    2. remove rows whose Sex is "." and rows containing any missing value;
    3. encode Species with the module-level LabelEncoder ``le``;
    4. one-hot encode what remains with ``pd.get_dummies``.

  Returns a tuple ``(features, labels)``: the encoded DataFrame (without the
  Species column) and the array produced by ``le.transform``.
  """
  unused = ["studyName", "Sample Number", "Individual ID", "Date Egg", "Comments", "Region"]
  cleaned = df.drop(columns=unused)
  cleaned = cleaned.loc[cleaned["Sex"] != "."].dropna()
  labels = le.transform(cleaned["Species"])
  features = pd.get_dummies(cleaned.drop(columns=["Species"]))
  return features, labels

X_train, y_train = prepare_data(train)

In [4]:
from itertools import combinations

# NOTE: these lists are deliberately incomplete -- add any other
# qualitative / quantitative columns you want the search to consider.
all_qual_cols = ["Clutch Completion", "Sex"]
all_quant_cols = ['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)']

# Enumerate every candidate feature set: one qualitative variable (all of its
# matching columns in X_train) together with one pair of quantitative columns.
for qual in all_qual_cols: 
  # substring match picks up the dummy columns, e.g. "Sex" -> Sex_FEMALE, Sex_MALE
  qual_cols = [col for col in X_train.columns if qual in col ]
  for pair in combinations(all_quant_cols, 2):
    cols = qual_cols + list(pair) 
    print(cols)
    # TODO: train and score a model on X_train[cols] here, keeping the
    # column list of the best-scoring model.

['Clutch Completion_No', 'Clutch Completion_Yes', 'Culmen Length (mm)', 'Culmen Depth (mm)']
['Clutch Completion_No', 'Clutch Completion_Yes', 'Culmen Length (mm)', 'Flipper Length (mm)']
['Clutch Completion_No', 'Clutch Completion_Yes', 'Culmen Depth (mm)', 'Flipper Length (mm)']
['Sex_FEMALE', 'Sex_MALE', 'Culmen Length (mm)', 'Culmen Depth (mm)']
['Sex_FEMALE', 'Sex_MALE', 'Culmen Length (mm)', 'Flipper Length (mm)']
['Sex_FEMALE', 'Sex_MALE', 'Culmen Depth (mm)', 'Flipper Length (mm)']


In [5]:
from sklearn.linear_model import LogisticRegression

# Baseline feature set. It counts as 3 features because the two
# Clutch Completion columns are transformations of a single original
# measurement.
# TODO: select better columns automatically, as suggested in the search
# loop in the previous code cell.
cols = ["Flipper Length (mm)", "Body Mass (g)", "Clutch Completion_No", "Clutch Completion_Yes"]

LR = LogisticRegression()
LR.fit(X_train[cols], y_train)
# Scored on the same rows used for fitting, so this is training accuracy,
# not a held-out estimate.
LR.score(X_train[cols], y_train)

0.6640625

In [6]:
from matplotlib.patches import Patch

def plot_regions(model, X, y):
    """Plot a fitted classifier's decision regions, one panel per qualitative column.

    The first two columns of ``X`` become the horizontal and vertical axes.
    Every remaining column is treated as a 0/1 indicator: for each of them a
    panel is drawn in which that indicator is set to 1 and all the others to
    0. The panel shows the model's predictions over a 501 x 501 grid spanning
    the observed range of the two axis columns, overlaid with the rows of
    ``X`` whose indicator equals 1.

    Parameters
    ----------
    model : object
        Fitted estimator whose ``predict`` accepts a DataFrame with the same
        columns as ``X``.
    X : pandas.DataFrame
        Feature table; columns 0 and 1 are the plotted axes, columns 2+ are
        the indicators.
    y : array-like
        Class labels aligned with the rows of ``X``. Colours are scaled to
        the range 0..2, and the legend assumes labels 0, 1, 2 stand for
        Adelie, Chinstrap, Gentoo in that order.

    Raises
    ------
    ValueError
        If ``X`` has no indicator columns after the first two.
    """
    x0 = X[X.columns[0]]
    x1 = X[X.columns[1]]
    qual_features = X.columns[2:]

    if len(qual_features) == 0:
        raise ValueError(
            "plot_regions needs at least one qualitative column after the first two columns of X"
        )

    # squeeze=False always returns a 2-D array of Axes, so indexing also works
    # when there is exactly one panel (plt.subplots would otherwise return a
    # bare Axes object and axarr[i] would fail).
    fig, axgrid = plt.subplots(1, len(qual_features), figsize=(7, 3), squeeze=False)
    axarr = axgrid[0]

    # create a grid
    grid_x = np.linspace(x0.min(), x0.max(), 501)
    grid_y = np.linspace(x1.min(), x1.max(), 501)
    xx, yy = np.meshgrid(grid_x, grid_y)

    XX = xx.ravel()
    YY = yy.ravel()

    for i, active in enumerate(qual_features):
        ax = axarr[i]

        XY = pd.DataFrame({
            X.columns[0]: XX,
            X.columns[1]: YY
        })

        # switch every indicator off, then turn on only the one for this panel
        for j in qual_features:
            XY[j] = 0
        XY[active] = 1

        p = model.predict(XY)
        p = p.reshape(xx.shape)

        # use contour plot to visualize the predictions
        ax.contourf(xx, yy, p, cmap="jet", alpha=0.2, vmin=0, vmax=2)

        # plot the data belonging to this panel's category
        ix = X[active] == 1
        ax.scatter(x0[ix], x1[ix], c=y[ix], cmap="jet", vmin=0, vmax=2)

        ax.set(xlabel=X.columns[0],
               ylabel=X.columns[1])

    # Take the legend colours from the colormap actually used for plotting.
    # With vmin=0 and vmax=2, "jet" draws label 0 in blue and label 2 in red,
    # so a hard-coded red/green/blue list would label the classes backwards.
    cmap = plt.get_cmap("jet")
    species = ["Adelie", "Chinstrap", "Gentoo"]
    patches = [
        Patch(color=cmap(code / (len(species) - 1)), label=spec)
        for code, spec in enumerate(species)
    ]

    # The legend goes on the last panel (where the implicit pyplot call used
    # to place it); legend and layout are each applied once, after all panels.
    axarr[-1].legend(title="Species", handles=patches, loc="best")
    fig.tight_layout()

# Predicting whether Russian Factories want to incorporate or not


`$$\mathcal{L}(a, b) = \sum_{i = 1}^n (ax_i + b - y_i)^2$$` 

renders to: 

$$\mathcal{L}(a, b) = \sum_{i = 1}^n (ax_i + b - y_i)^2\;.$$

# get the data in

In [7]:
import pandas as pd

# NOTE(review): this rebinds `train_url` and `train`, replacing the penguin
# data loaded earlier in the notebook; distinct names would keep the two
# datasets separate.
# The path is relative, so it is resolved against the notebook's working directory.
train_url = "./../Vermont_farm_tables/1850Midd.csv"
train = pd.read_csv(train_url)

In [8]:
train

Unnamed: 0,pg_num,rown,town,Name,improved,unimproved,cash_val,machine_val,horse,mules,...,hemp_other,flax,flaxseed,silk,maplesugar,canesugar,molasses,beewax,honey,manufactures_val
0,31,1,Middlebury,Horaliz Seymore,48,,2500,100,3.0,,...,0,0,0,0,0,0,0,0,40,1850
1,31,2,Middlebury,Semauel Seming,12,38.0,550,20,1.0,,...,0,0,0,0,0,0,0,0,30,1850
2,31,3,Middlebury,Machus Bass,35,,1000,15,1.0,,...,0,0,0,0,0,0,0,0,25,1850
3,31,4,Middlebury,James the Sonold,55,5.0,3000,100,2.0,,...,0,0,0,0,0,0,0,3,80,1850
4,31,5,Middlebury,James Miacham,60,,3000,50,3.0,,...,0,0,0,0,0,0,0,0,13,1850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,39,2,Middlebury,Sam S Surrand,60,40.0,2000,50,1.0,0.0,...,0,0,0,0,0,0,0,20,40,1850
166,39,3,Middlebury,Naham Parker,25,0.0,1000,30,5.0,0.0,...,0,0,0,0,0,0,0,0,50,1850
167,39,4,Middlebury,Sanil Bickwell S,60,13.0,1800,40,0.0,0.0,...,0,0,0,0,0,0,0,0,200,1850
168,39,5,Middlebury,Charlis R Ford,100,43.0,3000,55,9.0,0.0,...,0,0,0,0,0,0,0,0,150,1850


In [9]:
# Rvss = pd.io.stata.read_stata("./../Rvssian/AG_Corp_RuscorpMasterFile_Cleaned.dta")
# Rvss.to_csv("RvssianCorpMasterFileCleaned.csv")

In [10]:
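# Presumably a one-time conversion of the Stata file to the CSV that is read
# in the next cell; kept commented out.
# Note: to_csv() writes the row index by default, which comes back as an
# "Unnamed: 0" column when the CSV is re-read; pass index=False to avoid that.
# Rvss_data = pd.io.stata.read_stata("./AG_Corp_Prod_Database.dta")
# Rvss_data.to_csv("AG_Corp_Prod_DataBase.csv")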

In [11]:
# Load the factory-level table from the CSV in the notebook's directory.
# NOTE(review): the recorded output of this cell echoes this source line, which
# is how Python warnings are displayed -- presumably a pandas DtypeWarning about
# mixed-type columns. If so, pass an explicit dtype= (or low_memory=False) here.
Rvss = pd.read_csv("./AG_Corp_Prod_DataBase.csv")

  Rvss = pd.read_csv("./AG_Corp_Prod_DataBase.csv")


In [12]:
Rvss.head()

Unnamed: 0.1,Unnamed: 0,id,Form,PSZ,PSZ1900,FoundingYear,Province,Region,Industry,OntheSide,...,Animal,Wool,Cotton,MixedMaterials,Wood,Paper,MetalsandMachines,Foods,Chemical,Mineral
0,0,4156,0,,,1860.0,30,CentralIndustrial,Paper,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,52010,0,,,,51,Previslitskii,Foods A,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,30937,0,,,,41,CentralBlacksoil,Foods A,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,39923,0,,,,3,Caucasus,Metals and Machines,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,3296,0,,,1882.0,11,Northwestern,Chemicals,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
Rvss.columns

Index(['Unnamed: 0', 'id', 'Form', 'PSZ', 'PSZ1900', 'FoundingYear',
       'Province', 'Region', 'Industry', 'OntheSide', 'Age', 'TaxedActivity',
       'YEAR', 'PSZLastYear', 'PSZ1908', 'SubindustryCode', 'STCAP', 'Revenue',
       'TotalWorkers', 'TotalPower', 'GrandTotalWorkers', 'RevperWorker',
       'PowerperWorker', 'RevperGrandWorker', 'PowerperGrandWorker',
       'logRevperWorker', 'logPowerperWorker', 'logRevperGrandWorker',
       'logPowerperGrandWorker', 'logRev', 'logWorkers', 'logPower',
       'RegIndGroup', 'RegIndYearGroup', 'ProvIndGroup', 'ProvIndYearGroup',
       'IndYearGroup', 'IndustryFactor', 'ProvinceFactor', 'YearFactor',
       'AKTS', 'PAI', 'factory_id', 'FormNextYear', 'FormNextNextYear',
       'FactoryisCorpin1894', 'FormNextYearin1894', 'FactoryisCorpin1900',
       'FormNextYearin1900', 'FactoryisCorpin1908', 'NEWDEV', 'SHARES',
       'STPRICE', 'BONDS', 'Silk', 'Flax', 'Animal', 'Wool', 'Cotton',
       'MixedMaterials', 'Wood', 'Paper', 'Metalsa

In [14]:
# Keep the incorporation indicator (Form), the id column and a handful of
# factory characteristics; every other column of Rvss is left out.
df = Rvss[["Form", "id", "FoundingYear", "Industry", "TotalWorkers", "TotalPower", "GrandTotalWorkers", "logWorkers", "Mineral"]]

In [15]:
# Rows with Form == 1, i.e. the incorporated group.
df_inc = df.loc[df['Form'] == 1]

In [16]:
# Rows with Form == 0, i.e. the not-incorporated group.
df_not_inc = df.loc[df['Form'] == 0]

In [17]:
print(f"df incorporated have {df_inc.shape[0]} many rows")

df incorporated have 2393 many rows


In [18]:
# Downsample the not-incorporated group to the size of the incorporated group
# so the two classes are balanced. The size is taken from df_inc instead of a
# hard-coded row count, and the fixed seed makes the subsample (and the file
# exported from it) reproducible across runs.
df_not_inc = df_not_inc.sample(n=len(df_inc), replace=False, random_state=0)

In [19]:
print(f"df not incorporated have {df_not_inc.shape[0]} many rows")
# Stack the two groups into one frame, incorporated rows first. pd.concat
# keeps each row's original index label here (ignore_index is not set).
frames = [df_inc, df_not_inc]
result = pd.concat(frames)

df not incorporated have 2393 many rows


In [20]:
result['Form'].mean()

0.5

In [21]:
# Write the balanced table to a zip archive containing a single CSV member.
compression_opts = {
    'method': 'zip',
    'archive_name': 'artificially_balanced_Rvssian_Factory.csv',
}
result.to_csv(
    'artificially_balanced_Rvssian_Factory.zip',
    index=False,
    compression=compression_opts,
)

In [22]:
# Shuffle the rows (the concatenation placed all incorporated factories
# first) and renumber the index from 0. The fixed seed keeps the shuffled
# order reproducible across runs.
result = result.sample(frac=1, random_state=0).reset_index(drop=True)

In [24]:
result.head(10)

Unnamed: 0,Form,id,FoundingYear,Industry,TotalWorkers,TotalPower,GrandTotalWorkers,logWorkers,Mineral
0,1,38510,,Flax,1380.0,500.0,1380.0,7.229839,0.0
1,1,52600,,Foods A,105.0,0.0,105.0,4.65396,0.0
2,1,619,1857.0,Cotton,2007.0,731.0,2007.0,7.604396,0.0
3,1,3304,1892.0,Chemicals,349.0,521.0,349.0,5.855072,0.0
4,0,3103,1878.0,Chemicals,3.0,0.0,3.0,1.098612,0.0
5,1,24610,,Mineral Products,83.0,,83.0,4.41884,1.0
6,0,42114,,Wool,42.0,60.0,42.0,3.73767,0.0
7,0,40568,,Metals and Machines,36.0,12.0,36.0,3.583519,0.0
8,1,36404,,Paper,68.0,50.0,68.0,4.219508,0.0
9,0,48129,,Wood,21.0,12.0,21.0,3.044522,0.0


In [25]:
# FinalProject is imported from the project-local final_project_code module.
FP = FinalProject()
# NOTE(review): this rebinds `train` (previously the Vermont farm table).
# Presumably a train / validation / test partition of the balanced data --
# confirm against FinalProject.split_data.
train, validate, test = FP.split_data(result)

In [26]:
# Each call returns three objects, unpacked here as (frame, features, labels);
# that reading comes from the target names only -- confirm against
# FinalProject.prepare_data.
# NOTE(review): X_train / y_train replace the penguin versions created earlier
# in the notebook.
df_train, X_train, y_train = FP.prepare_data(train)
df_validate, X_validate, y_validate= FP.prepare_data(validate)
df_test, X_test, y_test = FP.prepare_data(test)