In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the full measurement table from the CSV file that sits beside this notebook.
csv_path = './energydata_complete.csv'
complete_df = pd.read_csv(csv_path)

In [None]:
# Preview the first ten rows as a quick sanity check of the load.
complete_df.iloc[:10]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns pandas treats as numeric.
complete_df.describe()

In [None]:
# Show the dtype pandas inferred for every column of the loaded frame.
complete_df.dtypes

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# Non-numeric columns (the raw `date` column is still object dtype at this point) are
# dropped explicitly: recent pandas versions raise on them instead of silently skipping.
complete_df.select_dtypes(include='number').corr()

In [None]:
# Plot the appliance readings in ascending order so that extreme values stand out
# at the right-hand end of the curve.
sorted_appliances = np.sort(complete_df.Appliances.values)
row_positions = range(complete_df.shape[0])

plt.figure(figsize=(10, 8))
plt.scatter(row_positions, sorted_appliances)
plt.xlabel('index', fontsize=12)
plt.ylabel('appliances', fontsize=12)
plt.show()

In [None]:
# Cap `Appliances` at its 1st and 99th percentiles so that a handful of extreme
# readings do not dominate the plots and correlations computed afterwards.
appliances = complete_df['Appliances']
llimit, ulimit = np.percentile(appliances.values, [1, 99])

# `.ix` has been removed from pandas, and chained assignment through it could write
# to a temporary copy. `clip` plus one column assignment is unambiguous and re-runnable.
complete_df['Appliances'] = appliances.clip(lower=llimit, upper=ulimit)

# Count histogram of the capped values. `sns.distplot` is deprecated; with kde=False
# it only drew a histogram, so matplotlib's `hist` is used directly.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(complete_df['Appliances'].values, bins=50)
ax.set_xlabel('Appliances', fontsize=12)
plt.show()


In [None]:
# Work on a copy so that columns added to `df_copy` later do not modify `complete_df`.
df_copy = complete_df.copy()

# Candidate predictors: every float64 column except the target itself.
x_cols = [
    col for col in df_copy.columns
    if col != 'Appliances' and df_copy[col].dtype == 'float64'
]

# Pearson correlation of each predictor with the target.
target_values = df_copy['Appliances'].values
labels = list(x_cols)
values = [np.corrcoef(df_copy[col].values, target_values)[0, 1] for col in x_cols]

# Sort ascending so the bar chart runs from the most negative to the most positive
# correlation.
corr_df = pd.DataFrame({'col_labels': labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values')

# One horizontal bar per predictor, labelled with its column name.
index = np.arange(len(labels))
fig, ax = plt.subplots(figsize=(12, 48))
ax.barh(index, corr_df['corr_values'].values, color='y')
ax.set_yticks(index)
ax.set_yticklabels(corr_df['col_labels'].values, rotation='horizontal')
ax.set_xlabel('Correlation coefficient')
ax.set_title('Correlation coefficient of the variables')

plt.show()



In [None]:
# Keep only the predictors whose correlation with the target exceeds 0.05 in magnitude.
# `.ix` no longer exists in pandas; a boolean mask with `.loc` selects the same rows.
corr_df_sel = corr_df.loc[corr_df['corr_values'].abs() > 0.05]
corr_df_sel

In [None]:
# Spearman rank correlation among the predictors listed in `corr_df_sel`.
cols_to_use = corr_df_sel.col_labels.tolist()

temp_df = df_copy[cols_to_use]
corrmat = temp_df.corr(method='spearman')

# Draw onto an explicitly created axes so the figure size and the title are
# guaranteed to belong to the heatmap rather than to whatever axes is current.
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corrmat, vmax=1., square=True, ax=ax)
ax.set_title('Important variables correlation map', fontsize=15)
plt.show()

In [None]:
# Re-check the column dtypes on the working copy.
df_copy.dtypes

# Import missingno & Pandas profiling

In [None]:
import missingno as msno
import pandas_profiling

In [None]:
# missingno nullity matrix for the working frame.
# Note from the original analysis: the resulting figure shows that there is no
# missing data in the dataset.
matrix_options = {'figsize': (16, 7), 'width_ratios': (15, 1)}
msno.matrix(df_copy, **matrix_options)

In [None]:
# missingno bar chart for the working frame (per-column completeness).
bar_figsize = (16, 7)
msno.bar(df_copy, figsize=bar_figsize)

In [None]:
# `pandas.tools.plotting` was removed from pandas; `scatter_matrix` now lives in `pandas.plotting`.
from pandas.plotting import scatter_matrix

In [None]:
# Pairwise scatter plots for the T1..T4 columns, with histograms on the diagonal.
temperature_cols = ['T1', 'T2', 'T3', 'T4']
fig, ax = plt.subplots(figsize=(10, 10))
scatter_matrix(df_copy[temperature_cols], ax=ax, diagonal='hist', alpha=0.2)

# alpha: transparency of the plotted points
# diagonal: {'hist', 'kde'} - 'hist' draws a histogram, 'kde' a kernel density estimate

In [None]:
# Scatter plot with a fitted regression line of T3 against T1; used to check for
# outliers in the x vs y relationship.
fig, ax = plt.subplots(figsize=(10, 10))
# x and y are passed by keyword: current seaborn releases no longer accept them positionally.
sns.regplot(x='T1', y='T3', data=df_copy, ax=ax)
# T1 is plotted on the x axis and T3 on the y axis, so label the axes accordingly.
ax.set_xlabel("T1 [°C]")
ax.set_ylabel("T3 [°C]")
fig.tight_layout()

# Pandas profiling

In [None]:
# Build the profiling report for the working frame; leaving it as the last
# expression of the cell lets the notebook render it.
profile_report = pandas_profiling.ProfileReport(df_copy)
profile_report

In [None]:
# dtype of the raw `date` column before any parsing.
df_copy['date'].dtype

In [None]:
# Convert the `date` column from object dtype to datetime, stored in a new
# `date_parsed` column so the raw values stay available.
# `infer_datetime_format` is deprecated in pandas 2.x (format inference is now the
# default behaviour), so the argument is no longer passed.
df_copy['date_parsed'] = pd.to_datetime(df_copy['date'])
df_copy.date_parsed.head(10)

In [None]:
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Fit an extra-trees classifier on the iris toy dataset and print the importance
# the fitted model assigns to each input feature.
dataset = datasets.load_iris()
# A fixed seed makes the randomized trees, and therefore the printed importances,
# reproducible across runs.
model = ExtraTreesClassifier(random_state=0)
model.fit(dataset.data, dataset.target)
print(model.feature_importances_)