### Import Libraries

In [None]:
import pandas as pd # load and manipulate data
import numpy as np # usually helps with mathematical operations on data
from sklearn.model_selection import train_test_split # split your dataset into train and test set
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer # check accuracy score of your classification model
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
from sklearn.metrics import plot_confusion_matrix # confusion matrix as a heatmap
from sklearn.metrics import classification_report # report of your classification model's accuracy
from scipy.stats import chi2_contingency # chi-sq test to check relation between two cat columns. p value >0.05 means no relation
from xgboost import XGBClassifier # import the XGBoost Classifier method

### print settings

In [None]:
# Show every row and column when printing a dataframe (None = no limit).
# Fully-qualified option names are used on purpose: the bare patterns
# "max_rows" / "max_columns" also match "styler.render.max_rows" /
# "styler.render.max_columns" in newer pandas releases, and an ambiguous
# pattern makes set_option raise OptionError.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

### import dataset

In [None]:
# read the CSV file into a dataframe (replace the placeholder with the real path)
data = pd.read_csv(filepath_or_buffer="<file location>")
# preview the first rows (head() defaults to 5) to sanity-check the load
data.head(n=5)

### Pandas manipulation

#### Drop columns

In [None]:
# drop columns that hold the same value in every row.
# dropna=False makes NaN count as a value: a column that is entirely NaN is
# treated as constant (and dropped), while a column mixing one value with NaN
# is kept because its rows are not all the same.
constant_cols = data.columns[data.nunique(dropna=False) == 1]
data.drop(columns=constant_cols, inplace=True)
# drop columns with the mentioned column names
data.drop(columns=['<col1>','<col2>','<col3>'], inplace=True)

In [None]:
# list the distinct values of every column (apply works column-wise by default)
data.apply(lambda series: series.unique())

#### Change column datatype

In [None]:
# cast the listed columns to the generic object dtype
object_cols = ['<col1>','<col2>','<col3>']
data[object_cols] = data[object_cols].astype('object')
# convert a column to numeric; errors='coerce' turns values that cannot be
# parsed as numbers into NaN instead of raising
numeric_col = '<col1>'
data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

#### list of specific columns

In [None]:
# get list of categorical and numeric columns from your dataframe.
# Both lists are taken from `data`: no `X` is defined at this point, and
# catCols is used further down to index `data` during one hot encoding.
# NOTE(review): iloc[:, 1:] skips the first column for the categorical list
# only - presumably an identifier column; confirm against your dataset.
catCols = data.iloc[:, 1:].select_dtypes(include='object').columns
numCols = data.select_dtypes(include='number').columns

### Check correlation and significance

#### Correlation between categorical columns

In [None]:
# contingency table: rows are the values of the first column, columns the values of the second
CrosstabResult = pd.crosstab(data['<CatCol1>'], data['<CatCol2>'])
# run the chi-square test of independence; element [1] of the result is the p-value
ChiSqResult = chi2_contingency(CrosstabResult)
print('The P-Value of the ChiSq Test is:', ChiSqResult[1])
# Null hypothesis: the two variables are independent (not related).
# P value < 0.05 -> reject the null hypothesis, i.e. the variables are likely related.
# P value >= 0.05 -> no evidence of a relation between the two variables.

### Label Encoding

In [None]:
# label encoding: swap each listed category for its numeric code via replace;
# values not present in the mapping are left unchanged
label_codes = {'<Val1>':1, '<Val2>':0}
data['<CatCol1>'] = data['<CatCol1>'].replace(label_codes)

### One hot encoding

In [None]:
# one hot encoding: for every categorical column, append its indicator
# columns (named "<column>_<value>") and remove the original column
for cat_col in catCols:
    indicators = pd.get_dummies(data[cat_col], prefix=cat_col)
    data = data.join(indicators).drop(columns=cat_col)