# Capstone III - Data Wrangling
## Data taken from CSV from export of User Utility Database

### Pull in appropriate libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import mode
import os
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score 

### Import Data

In [2]:
data = pd.read_csv('Dashboard Views.csv',encoding='cp1252')

In [3]:
data.head()

Unnamed: 0,Standard Date,Year,Month,Country,Industry,Company,Dashboard Name,Dashboard Page Views,Dashboard Requests,Dashboard View Render Time millisec
0,1/3/2021,2021,3,Australia,Consulting,KPMG,Project Analysis,3,4,30234
1,1/3/2021,2021,3,Australia,Other,APA Group,Wind Technical Analysis,1,1,5322
2,1/3/2021,2021,3,Australia,Other,Octopus Investments,Country Analysis,5,4,13380
3,1/3/2021,2021,3,Australia,Other,SunPower,Country Analysis,3,2,9694
4,1/3/2021,2021,3,Australia,Other,SunPower,Solar PV Technical Analysis,0,1,4361


### Check and Prepare Data

In [4]:
#Check Datatypes
data.dtypes

Standard Date                          object
Year                                    int64
Month                                   int64
Country                                object
Industry                               object
Company                                object
Dashboard Name                         object
Dashboard Page Views                    int64
Dashboard Requests                      int64
Dashboard View Render Time millisec     int64
dtype: object

In [5]:
#Confirm no string in series
data['Standard Date'].unique()

array(['1/3/2021', '2/3/2021', '3/3/2021', '4/3/2021', '5/3/2021',
       '6/3/2021', '7/3/2021', '8/3/2021', '9/3/2021', '10/3/2021',
       '11/3/2021', '12/3/2021', '13/03/2021', '14/03/2021', '15/03/2021',
       '16/03/2021', '17/03/2021', '18/03/2021', '19/03/2021',
       '21/03/2021', '22/03/2021', '23/03/2021', '24/03/2021',
       '25/03/2021', '26/03/2021', '27/03/2021', '28/03/2021',
       '29/03/2021', '30/03/2021', '31/03/2021', '1/4/2021', '2/4/2021',
       '4/4/2021', '5/4/2021', '6/4/2021', '7/4/2021', '8/4/2021',
       '9/4/2021', '11/4/2021', '12/4/2021', '13/04/2021', '14/04/2021',
       '15/04/2021', '16/04/2021', '17/04/2021', '18/04/2021',
       '19/04/2021', '20/04/2021', '21/04/2021', '22/04/2021',
       '23/04/2021', '24/04/2021', '25/04/2021', '26/04/2021',
       '27/04/2021', '28/04/2021', '29/04/2021', '30/04/2021', '1/5/2021',
       '2/5/2021', '3/5/2021', '4/5/2021', '5/5/2021', '6/5/2021',
       '7/5/2021', '8/5/2021', '10/5/2021', '11/5/2021', 

In [6]:
#Change Standard Date to datetime dtype
data['Standard Date'] = pd.to_datetime(data['Standard Date'])
data.dtypes

Standard Date                          datetime64[ns]
Year                                            int64
Month                                           int64
Country                                        object
Industry                                       object
Company                                        object
Dashboard Name                                 object
Dashboard Page Views                            int64
Dashboard Requests                              int64
Dashboard View Render Time millisec             int64
dtype: object

In [7]:
#Get info of database
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2310 entries, 0 to 2309
Data columns (total 10 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   Standard Date                        2310 non-null   datetime64[ns]
 1   Year                                 2310 non-null   int64         
 2   Month                                2310 non-null   int64         
 3   Country                              2310 non-null   object        
 4   Industry                             2310 non-null   object        
 5   Company                              2310 non-null   object        
 6   Dashboard Name                       2310 non-null   object        
 7   Dashboard Page Views                 2310 non-null   int64         
 8   Dashboard Requests                   2310 non-null   int64         
 9   Dashboard View Render Time millisec  2310 non-null   int64         
dtypes: datetime6

In [8]:
#Check for missing values
data.isnull().sum()

Standard Date                          0
Year                                   0
Month                                  0
Country                                0
Industry                               0
Company                                0
Dashboard Name                         0
Dashboard Page Views                   0
Dashboard Requests                     0
Dashboard View Render Time millisec    0
dtype: int64

In [9]:
#Change column names for ease
columns = data.columns
print(columns)
new_columns = pd.Series(columns).str.lower()
new_columns[0] = 'date'
new_columns[6] = 'dashname'
new_columns[7] = 'views'
new_columns[8] = 'requests'
new_columns[9] = 'rendertime'
data.columns = new_columns
data.columns

Index(['Standard Date', 'Year', 'Month', 'Country', 'Industry', 'Company',
       'Dashboard Name', 'Dashboard Page Views', 'Dashboard Requests',
       'Dashboard View Render Time millisec'],
      dtype='object')


Index(['date', 'year', 'month', 'country', 'industry', 'company', 'dashname',
       'views', 'requests', 'rendertime'],
      dtype='object')

In [10]:
#check on shape
data.shape

(2310, 10)

In [11]:
#Change render time from milliseconds to seconds
data.rendertime = data.rendertime / 1000
data.head()

Unnamed: 0,date,year,month,country,industry,company,dashname,views,requests,rendertime
0,2021-01-03,2021,3,Australia,Consulting,KPMG,Project Analysis,3,4,30.234
1,2021-01-03,2021,3,Australia,Other,APA Group,Wind Technical Analysis,1,1,5.322
2,2021-01-03,2021,3,Australia,Other,Octopus Investments,Country Analysis,5,4,13.38
3,2021-01-03,2021,3,Australia,Other,SunPower,Country Analysis,3,2,9.694
4,2021-01-03,2021,3,Australia,Other,SunPower,Solar PV Technical Analysis,0,1,4.361


In [15]:
datapath = '../data'
if not os.path.exists(datapath):
    os.mkdir(datapath)

In [17]:
datapath_dashboards = os.path.join(datapath, 'Dashboard Views_Cleaned.csv')
if not os.path.exists(datapath_dashboards):
    data.to_csv(datapath_dashboards, index=False)