# Command Reference Sheet

## Step 1: Data Preparation

### Importing our Data

`read_csv`:
```python
import pandas as pd

# Load a CSV file into a DataFrame (replace the path with your own file)
df = pd.read_csv('file_name.csv')
```

`read_excel`:
```python
import pandas as pd

# Load an Excel workbook into a DataFrame (replace the path with your own file)
df = pd.read_excel('file_name.xls')
```

`read_json`:
```python
import pandas as pd

# Load a JSON file into a DataFrame (replace the path with your own file)
df = pd.read_json('file_name.json')
```

### Cleaning the Data

`dropna`:
```python
# With default arguments, drops every row that contains at least one missing value
df = df.dropna()
```

`SimpleImputer`
```python
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# 'missing_values' takes a single placeholder value (not a list).
# The 'mean' strategy only works on numeric columns.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed = imputer.fit_transform(df)

# fit_transform returns a NumPy array, so rebuild the dataframe.
# Columns that are entirely missing are dropped by the imputer, which is why
# the column names come from get_feature_names_out() rather than df.columns.
df = pd.DataFrame(data=imputed, columns=imputer.get_feature_names_out(), index=df.index)
```

### Preparing Data

`sklearn.preprocessing.normalize`
```python
from sklearn.preprocessing import normalize

# axis=0 scales each feature (column) to unit norm; use axis=1 to scale each sample (row)
normalized = normalize(df_feature, axis=0)

# Reconstruct the dataframe, keeping the original column names and row index
# so the features stay aligned with df_target
df_feature = pd.DataFrame(data=normalized, columns=df_feature.columns, index=df_feature.index)
```

`sklearn.feature_selection.VarianceThreshold`
```python
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold() # Default threshold=0.0 removes features that have the same value in every row
selected = selector.fit_transform(df_feature)

# Reconstruct the dataframe using only the names of the features that were kept
df_feature = pd.DataFrame(data=selected, columns=selector.get_feature_names_out(), index=df_feature.index)
```

`sklearn.feature_selection.SelectKBest`
```python
from sklearn.feature_selection import SelectKBest

# Change 'k' to be the number of features you want to conserve.
# The default score function (f_classif) is for categorical targets;
# pass score_func=f_regression for continuous targets.
selector = SelectKBest(k=5)
selected = selector.fit_transform(df_feature, df_target) # Scoring is supervised, so the target is required

# Reconstruct the dataframe using only the names of the features that were kept
df_feature = pd.DataFrame(data=selected, columns=selector.get_feature_names_out(), index=df_feature.index)
```

`sklearn.feature_selection.SelectPercentile`
```python
from sklearn.feature_selection import SelectPercentile

# Change 'percentile' to be the percentage (0 to 100) of features you want to conserve.
# The default score function (f_classif) is for categorical targets;
# pass score_func=f_regression for continuous targets.
selector = SelectPercentile(percentile=50)
selected = selector.fit_transform(df_feature, df_target) # Scoring is supervised, so the target is required

# Reconstruct the dataframe using only the names of the features that were kept
df_feature = pd.DataFrame(data=selected, columns=selector.get_feature_names_out(), index=df_feature.index)
```

`sklearn.decomposition.PCA`
```python
from sklearn.decomposition import PCA

reducer = PCA(n_components=10) # Change 'n_components' to be the number of features you want (1 or more), OR a proportion of variance you want conserve (0 to 1)
components = reducer.fit_transform(df_feature)

# Restore it to dataframe format.
# The number of columns is read from the result, because it is only known
# after fitting when 'n_components' is given as a proportion of variance.
col_names = [f"pc{i}" for i in range(components.shape[1])]

df_feature = pd.DataFrame(data=components, columns=col_names, index=df_feature.index)
```

`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`
```python
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Change 'n_components' to be the number of features you want (1 or more).
# It cannot exceed min(number of classes - 1, number of features).
reducer = LDA(n_components=10)
components = reducer.fit_transform(df_feature, df_target) # LDA is supervised, so the (categorical) target is required

# Restore it to dataframe format, one column per discriminant actually produced
col_names = [f"ld{i}" for i in range(components.shape[1])]

df_feature = pd.DataFrame(data=components, columns=col_names, index=df_feature.index)
```

`sklearn.preprocessing.LabelEncoder`
```python
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder() # Converts each distinct target label into an integer code
df_target = encoder.fit_transform(df_target)
```

## Model Selection and Training

### Train-Test Split

`sklearn.model_selection.train_test_split`
```python
from sklearn.model_selection import train_test_split

# Splits features and target with the same row shuffle.
# By default 25% of the rows are held out for testing; pass test_size= to change it
# and random_state= to make the split reproducible.
df_feature_train, df_feature_test, df_target_train, df_target_test = train_test_split(df_feature, df_target)
```

### Continuous Models

`sklearn.linear_model.LinearRegression`
```python
from sklearn.linear_model import LinearRegression

model = LinearRegression()
# Train on the training split only; keep the test split for evaluation
model.fit(df_feature_train, df_target_train)
```

`sklearn.svm.SVR`
```python
from sklearn.svm import SVR

model = SVR()
# Train on the training split only; keep the test split for evaluation
model.fit(df_feature_train, df_target_train)
```

`sklearn.ensemble.RandomForestRegressor`
```python
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
# Train on the training split only; keep the test split for evaluation
model.fit(df_feature_train, df_target_train)
```

### Categorical Models

`sklearn.linear_model.LogisticRegression`
```python
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
# Train on the training split only; keep the test split for evaluation
model.fit(df_feature_train, df_target_train)
```

`sklearn.svm.SVC`
```python
from sklearn.svm import SVC

model = SVC()
# Train on the training split only; keep the test split for evaluation
model.fit(df_feature_train, df_target_train)
```

`sklearn.ensemble.RandomForestClassifier`
```python
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
# Train on the training split only; keep the test split for evaluation
model.fit(df_feature_train, df_target_train)
```

## Model Evaluation

### Continuous Evaluations

`sklearn.metrics.mean_squared_error`
```python
from sklearn.metrics import mean_squared_error

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# sklearn metrics take the true values first, then the predictions
print(mean_squared_error(df_target_test, model_predicted_test))
```

`sklearn.metrics.mean_absolute_error`
```python
from sklearn.metrics import mean_absolute_error

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# sklearn metrics take the true values first, then the predictions
print(mean_absolute_error(df_target_test, model_predicted_test))
```

`sklearn.metrics.r2_score`
```python
from sklearn.metrics import r2_score

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# R^2 is not symmetric: the true values must come first, then the predictions
print(r2_score(df_target_test, model_predicted_test))
```

### Categorical Evaluations

`sklearn.metrics.accuracy_score`
```python
from sklearn.metrics import accuracy_score

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# sklearn metrics take the true labels first, then the predictions
print(accuracy_score(df_target_test, model_predicted_test))
```

`sklearn.metrics.balanced_accuracy_score`
```python
from sklearn.metrics import balanced_accuracy_score

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# Balanced accuracy averages per-class recall, so the true labels must come first
print(balanced_accuracy_score(df_target_test, model_predicted_test))
```

`sklearn.metrics.roc_auc_score`
```python
from sklearn.metrics import roc_auc_score

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# The true labels come first, then the scores.
# NOTE: hard class predictions give only a coarse AUC. For binary targets, prefer
# continuous scores when the model provides them, e.g.
# model.predict_proba(df_feature_test)[:, 1] or model.decision_function(df_feature_test).
print(roc_auc_score(df_target_test, model_predicted_test))
```

`sklearn.metrics.recall_score`
```python
from sklearn.metrics import recall_score

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# The true labels must come first; swapping the arguments computes precision instead.
# For targets with more than two classes, pass average='macro' (or 'micro' / 'weighted').
print(recall_score(df_target_test, model_predicted_test))
```

`sklearn.metrics.precision_score`
```python
from sklearn.metrics import precision_score

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# The true labels must come first; swapping the arguments computes recall instead.
# For targets with more than two classes, pass average='macro' (or 'micro' / 'weighted').
print(precision_score(df_target_test, model_predicted_test))
```

`sklearn.metrics.classification_report`
```python
from sklearn.metrics import classification_report

# Evaluate on the held-out test split, not on the data the model was trained on
model_predicted_test = model.predict(df_feature_test)
# The true labels must come first; swapping the arguments swaps precision and recall in the report
print(classification_report(df_target_test, model_predicted_test))
```