Importing the Data and EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

In [None]:
# Load the three CSV files used throughout the notebook:
# the labelled training features, their labels, and the unlabelled test set.
training = pd.read_csv('data/Training data.csv')
target = pd.read_csv('data/Training data target.csv')
testing = pd.read_csv('data/Test data.csv')

In [None]:
# Peek at the first row of the test set to see its columns.
testing.head(n=1)

In [None]:
# Peek at the first row of the training features to see their columns.
training.head(n=1)

In [None]:
# Peek at the first row of the label table.
target.head(n=1)

Here I begin analyzing the columns in the dataset that I was assigned:

In [None]:
# Columns assigned for this part of the exploratory analysis.
assigned_columns = [
    'basin',
    'subvillage',
    'region',
    'region_code',
    'district_code',
    'lga',
    'ward',
    'population',
    'public_meeting',
    'recorded_by',
]

# Work on just those columns and print their dtypes / non-null counts.
df = training.loc[:, assigned_columns]
df.info()

In [None]:
# Frequency table for `basin`.
# print() is required: a notebook cell only echoes its final expression,
# so a bare value_counts() on a non-final line is silently discarded.
print(df['basin'].value_counts())

# Same frequencies as a count plot.
sns.countplot(x=df['basin'])

In [None]:
# Frequency table for `subvillage`.
# print() is required: a notebook cell only echoes its final expression,
# so a bare value_counts() on a non-final line is silently discarded.
print(df['subvillage'].value_counts())

# Bar plot of population for the first five rows only.
# Both axes come from the same five-row slice, so the plot no longer relies
# on a 5-element x being index-aligned against the full-length population
# column.
first_rows = df[['subvillage', 'population']].head(5)
sns.barplot(x='subvillage', y='population', data=first_rows)

In [None]:
# Frequency table for `region`.
# print() is required: a notebook cell only echoes its final expression,
# so a bare value_counts() on a non-final line is silently discarded.
print(df['region'].value_counts())

# Same frequencies as a count plot.
sns.countplot(x=df['region'])

In [None]:
# Frequency table for `region_code`.
# print() is required: a notebook cell only echoes its final expression,
# so a bare value_counts() on a non-final line is silently discarded.
print(df['region_code'].value_counts())

# Same frequencies as a count plot.
sns.countplot(x=df['region_code'])

In [None]:
# Frequency table for `district_code`.
# print() is required: a notebook cell only echoes its final expression,
# so a bare value_counts() on a non-final line is silently discarded.
print(df['district_code'].value_counts())

# Same frequencies as a count plot.
sns.countplot(x=df['district_code'])

In [None]:
#df['lga'].value_counts()

# Sum the numeric columns per `lga` and draw the totals as a line plot.
# numeric_only=True keeps the text columns out of the aggregation: on
# pandas >= 2.0 the default (numeric_only=False) would try to add up the
# string columns as well instead of skipping them.
df.groupby(['lga']).sum(numeric_only=True).plot()

In [None]:
#df['ward'].value_counts()
#df_ward = df.groupby(['ward']).count()

# Sum the numeric columns per `ward` and draw the totals as a line plot.
# numeric_only=True keeps the text columns out of the aggregation: on
# pandas >= 2.0 the default (numeric_only=False) would try to add up the
# string columns as well instead of skipping them.
df.groupby(['ward']).sum(numeric_only=True).plot()

In [None]:
# Frequency table for `population`.
# print() is required: a notebook cell only echoes its final expression,
# so a bare value_counts() on a non-final line is silently discarded.
print(df['population'].value_counts())

# NOTE(review): with only `x` supplied, barplot summarises the whole column
# as a single bar rather than showing its distribution; presumably a
# histogram was intended here - confirm before relying on this figure.
sns.barplot(x=df['population'])

In [None]:
# Frequency table for `public_meeting`.
# print() is required: a notebook cell only echoes its final expression,
# so a bare value_counts() on a non-final line is silently discarded.
print(df['public_meeting'].value_counts())

# Same frequencies as a count plot.
sns.countplot(x=df['public_meeting'])

In [None]:
# Frequency table for `recorded_by`.
# print() is required: a notebook cell only echoes its final expression,
# so a bare value_counts() on a non-final line is silently discarded.
print(df['recorded_by'].value_counts())

# Same frequencies as a count plot.
sns.countplot(x=df['recorded_by'])

Now moving on to adding some visuals.

In [None]:
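# TODO: plot the features against the target. seaborn has no `sns.scatter`;
# `y_vars` is a `sns.pairplot` argument, so that is presumably the intended call.
#sns.scatter(pd.concat([training, target], axis=1), y_vars=['status_group'])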

Now on to training a model. 

In [None]:
# Columns left out of the feature matrix.
excluded_columns = ['id', 'date_recorded', 'longitude', 'latitude', 'recorded_by']

# Feature matrix: every training column except the excluded ones.
X = training.drop(columns=excluded_columns)

In [None]:
# Label vector: the `status_group` column of the target table.
y = target.loc[:, 'status_group']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=500,
)

In [None]:
# Print the dtype and non-null count of every feature column.
X.info()

In [None]:
# Split the feature names by dtype so each group can get its own
# preprocessing. The results are kept as pandas Index objects
# (original note: "Transformer requires Index type").
object_features = X.select_dtypes(include=['object'])
non_object_features = X.select_dtypes(exclude=['object'])

categorical_columns = object_features.columns
numeric_columns = non_object_features.columns

In [None]:
# Display the names of the object-dtype (categorical) feature columns.
categorical_columns

In [None]:
# Numeric features: fill gaps with the column median (adding missing-value
# indicator columns), then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with a constant placeholder, then one-hot
# encode. Categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

#target_transformer = LabelEncoder()

# Route each column group through its own transformer.
column_routes = [
    ("num", numeric_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Baseline classifier: the shared preprocessing followed by logistic
# regression, with a fixed seed for reproducibility.
logreg_model = Pipeline(
    steps=[
        ('clean', preprocessor),
        ('logreg', LogisticRegression(random_state=500)),
    ]
)

logreg_model.fit(X_train, y_train)

# Report accuracy on both splits. Scoring only the rows the model was fitted
# on says nothing about generalisation, so the held-out split is scored too.
train_accuracy = logreg_model.score(X_train, y_train)
test_accuracy = logreg_model.score(X_test, y_test)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)