# 1. Data Cleaning

### Count Nulls

In [None]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

### Fill NAs with Numeric Mean

In [None]:
# Fill missing values with each numeric column's mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError when df contains non-numeric columns.
# Non-numeric columns get no fill value and keep their NAs.
df = df.fillna(df.mean(numeric_only=True))

### Removing columns with more than half the rows having NA

In [None]:
# thresh is the minimum number of non-NA values a column needs to be kept,
# so columns where more than half of the rows are NA are dropped.
half_count = df.shape[0] / 2
df = df.dropna(axis="columns", thresh=half_count)

### Select Only Columns of A Certain Type

In [None]:
# "number" selects every numeric dtype regardless of bit width.
# The strings "int" and "float" resolve to specific default-width types,
# so e.g. float32 columns (or int64 on platforms whose default int is
# 32-bit) could be silently left out.
numeric_df = df.select_dtypes(include="number")

# 2. Data Visualization

### Making Figures and Subplots

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

fig = plt.figure(figsize=(15,10))

# Multipliers applied to x inside sin(); one subplot is drawn per value.
gradients = [5,10,15,20,25,30]
x = np.linspace(-2*np.pi,2*np.pi,200)

for i,g in enumerate(gradients):
    ax = fig.add_subplot(3,2,i+1) # (Rows, Columns, Graph Num)
    ax.set_title("$sin({}x)$".format(g))
    # Pass x explicitly so the horizontal axis shows x, not the sample index.
    ax.plot(x, np.sin(g*x))

# Re-space the grid so subplot titles do not overlap the axes above them.
fig.tight_layout()

### Correlation Heatmap

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Correlate numeric columns only; without numeric_only=True, pandas >= 2.0
# raises when df contains non-numeric (e.g. string) columns.
sns.heatmap(df.corr(numeric_only=True))

# 3. Data Preparation

### Reshuffle Data Set [DataFrame]

In [None]:
# Option 1: shuffle by position. iloc indexes by integer position, so the
# permutation must be over row positions (0..len-1), not over index labels.
shuffled_rows = np.random.permutation(len(df))
shuffled_df = df.iloc[shuffled_rows]

# OR

# Option 2: shuffle by label. reindex looks rows up by index label, so the
# permutation is taken over df.index (this assumes the labels are unique).
df = df.reindex(np.random.permutation(df.index))

### Split and Shuffle [DataFrame]

In [None]:
# Draw a reproducible random 80% of the rows for training...
train = df.sample(frac=0.8, random_state=1)
# ...and keep every row whose index label was not drawn as the test set.
in_train = df.index.isin(train.index)
test = df[~in_train]

### Split and Shuffle [Arrays]

In [None]:
from sklearn.model_selection import train_test_split # need scikit-learn 0.19.2 for shuffle

# Toy data: the integers 1..10 and their English names.
X = list(range(1, 11))
y = ['one', 'two', 'three', 'four', 'five',
     'six', 'seven', 'eight', 'nine', 'ten']

# Shuffle, then hold out 20% of the samples; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=None,
    random_state=1,
    shuffle=True,
)

print(X_train, X_test)
print(y_train, y_test)

# 4. Machine Learning

### K-Fold Cross Validation [MSE] + KNN Regression

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor

# 5 shuffled folds with a fixed seed so the split is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
knn = KNeighborsRegressor()

# The scorer is "neg_mean_squared_error", so take absolute values
# before the square root.
mses = cross_val_score(
    estimator=knn,
    X=dc_listings[["accommodates"]],
    y=dc_listings["price"],
    scoring="neg_mean_squared_error",
    cv=kf,
)
rmses = np.sqrt(np.abs(mses))
avg_rmse = rmses.mean()

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

target = 'SalePrice'

# strong_corrs is presumably a Series keyed by column name (TODO confirm).
# Drop two entries by label, then use every remaining label except the
# target as a feature.
final_corr_cols = strong_corrs.drop(['Garage Cars', 'TotRms AbvGrd'])
features = final_corr_cols.drop([target]).index

# Restrict the test set to the retained columns and discard rows with NAs.
test = test[final_corr_cols.index]
clean_test = test.dropna(axis="index")

# Fit on the training set and measure the in-sample (training) RMSE.
lm = LinearRegression()
lm.fit(train[features], train[target])
train_predictions = lm.predict(train[features])
train_mse = mean_squared_error(train[target], train_predictions)
train_rmse = np.sqrt(train_mse)

### Dummy Categorical Variables

In [None]:
# Build the indicator columns for both categorical fields first...
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
dummy_years = pd.get_dummies(cars["year"], prefix="year")

# ...append them column-wise in one go (cylinders first, then years)...
cars = pd.concat([cars, dummy_cylinders, dummy_years], axis=1)

# ...and drop the original columns now that they are encoded.
cars = cars.drop(columns=["year", "cylinders"])

### Categories to Numerical

In [None]:
# Columns to convert from labels to integer category codes.
categorical_columns = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
    "high_income",
]

for col_name in categorical_columns:
    # Cast to the pandas category dtype, then overwrite the column with
    # the integer code assigned to each category.
    categories = income[col_name].astype("category")
    income[col_name] = categories.cat.codes

### Random Choice Centroids

In [None]:
num_clusters = 5
# Draw num_clusters distinct index labels. replace=False is required:
# np.random.choice samples WITH replacement by default, which could pick
# the same row twice and yield duplicate starting centroids.
# (Raises ValueError if point_guards has fewer than num_clusters rows.)
random_initial_points = np.random.choice(point_guards.index, size=num_clusters, replace=False)
# Use the random indices to create the centroids
centroids = point_guards.loc[random_initial_points]

### Entropy
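Entropy is the negative sum of $p_i \log_2 p_i$ over all unique classes $i$, where $p_i$ is the proportion of rows belonging to class $i$: $H = -\sum_i p_i \log_2 p_i$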

In [None]:
import math  # imported here so the cell does not depend on an earlier cell

# Proportion of rows in each class of the binary "high_income" column.
total_rows = income.shape[0]
prob_1 = income[income["high_income"]==1].shape[0]/total_rows
prob_0 = income[income["high_income"]==0].shape[0]/total_rows

# Entropy in bits: -sum(p * log2(p)). A class with probability 0 is skipped
# (its term is taken as 0) because math.log(0, 2) raises ValueError.
income_entropy = -sum(p*math.log(p,2) for p in (prob_1, prob_0) if p > 0)

### Information Gain
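Information gain is the entropy of the original set minus the weighted sum of the entropies of the subsets produced by the split, where each weight is the fraction of rows that fall into that subset: $IG(T, A) = H(T) - \sum_{v} \frac{|T_v|}{|T|} H(T_v)$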

# A. Miscellaneous

### Key with the Smallest or Largest Value in a Dict

In [None]:
abd = {
    0: 8.165359086946255,
    1: 4.041494056879455,
    2: 3.025911506812519,
    3: 17.652380991140518,
    4: 14.111415188949815,
}

# Compare entries by their value and print the key of the winning entry.
print(min(abd.items(), key=lambda item: item[1])[0])
print(max(abd.items(), key=lambda item: item[1])[0])

### Finding Index of Max in List

In [None]:
# Find the largest value, then the position of its first occurrence.
best_gain = max(information_gains)
highest_gain_index = information_gains.index(best_gain)

### Setting Pandas Maximum Columns Option

In [None]:
# Attribute-style access to the same option that set_option would write.
pd.options.display.max_columns = 53
# Read the option back so the cell displays the value now in effect.
pd.get_option("display.max_columns")