In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the Pokemon CSV into the notebook-wide `data` frame and display it.
# NOTE(review): the path is absolute and machine-specific; a later cell reloads
# the same file name from a different drive.
pokemon_csv = r"C:\Users\other user\Downloads\pokemon.csv"
data = pd.read_csv(pokemon_csv)
data

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Pairwise correlation of the numeric columns (part of predictive modelling; discussed in detail later).
# numeric_only=True is required on pandas >= 2.0, where text columns (e.g. Type 1)
# are no longer dropped silently and make corr() raise.
data.corr(numeric_only=True)

In [None]:
# Correlation map: heatmap of the pairwise correlations between the numeric columns.
# numeric_only=True keeps corr() from failing on the text columns (pandas >= 2.0).
# Remove annot=True to see the same map without the number printed in each cell.
f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(data.corr(numeric_only=True), annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

In [None]:
# Preview the first 10 rows.
data.head(10)

In [None]:
# List the column labels of the frame.
data.columns

In [None]:
#DATA VISUALIZATION: Matplotlib is a Python library that helps us plot data. The easiest and most basic plots are line,
#scatter and histogram plots.
#A line plot is better when the x axis is time.
#A scatter plot is better when there is correlation between two variables.
#A histogram is better when we need to see the distribution of numerical data.
#Customization: colors, labels, line thickness, title, opacity, grid, figsize, axis ticks and linestyle

In [None]:
# Line Plot
# color = line color, label = legend entry, linewidth = width of line,
# alpha = opacity, grid = show grid, linestyle = style of line
line_ax = data.Speed.plot(kind='line', color='g', label='Speed', linewidth=1, alpha=0.5, grid=True, linestyle=':')
# Draw Defense on the same axes so both lines share one plot
data.Defense.plot(ax=line_ax, color='r', label='Defense', linewidth=1, alpha=0.5, grid=True, linestyle='-.')
line_ax.legend(loc='upper right')   # legend = puts the labels into the plot
line_ax.set_xlabel('x axis')        # axis labels
line_ax.set_ylabel('y axis')
line_ax.set_title('Line Plot')      # title of the plot
plt.show()

In [None]:
# Scatter Plot
# x = attack, y = defense
data.plot(kind='scatter', x='Attack', y='Defense', alpha=0.5, color='red')
plt.xlabel('Attack')                        # axis labels
plt.ylabel('Defense')                       # spelled like the column and the title
plt.title('Attack Defense Scatter Plot')    # title of the plot
plt.show()                                  # render the figure instead of echoing the Text repr of plt.title

In [None]:
# Histogram of the Speed column
# bins = number of bars in the figure
data.Speed.plot.hist(bins=50, figsize=(12, 12))
plt.show()

In [None]:
# plt.clf() clears the current figure so you can start afresh
data.Speed.plot.hist(bins=50)
plt.clf()
# The histogram is not shown because clf() wiped the figure before display


In [None]:
# 1 - Filtering a pandas DataFrame with a boolean mask
# (the original author notes that only 3 pokemons have a defense value above 200)
x = data['Defense'].gt(200)
data.loc[x]

In [None]:
# 2 - Filtering pandas with np.logical_and
# (the original author notes that only 2 pokemons have defense above 200 and attack above 100)
high_defense = data['Defense'] > 200
high_attack = data['Attack'] > 100
data[np.logical_and(high_defense, high_attack)]

In [None]:
# Same filter as the np.logical_and version: '&' combines boolean masks element-wise.
# Each comparison needs its own parentheses because '&' binds tighter than '>'.
data.loc[(data['Defense'] > 200) & (data['Attack'] > 100)]

In [None]:
# One more list comprehension example on the pokemon data:
# classify each pokemon as "high" or "low" speed, using the average speed as threshold.
threshold = data.Speed.mean()   # vectorised mean; skips NaN instead of turning the threshold into NaN
data["speed_level"] = ["high" if speed > threshold else "low" for speed in data.Speed]
data.loc[:10, ["speed_level", "Speed"]]   # label-based slice; loc is covered in more detail later

In [None]:
# Frequency of each primary pokemon type.
# dropna=False makes NaN values, if any, show up as their own row in the counts.
type1_counts = data['Type 1'].value_counts(dropna=False)
print(type1_counts)
# The original author reads off e.g. 112 water and 70 grass pokemon from this output

In [None]:
# Summary statistics of the numeric columns
# (the original author notes e.g. that max HP is 255 and min defense is 5).
data.describe() # null entries are ignored

# VISUAL EXPLORATORY DATA ANALYSIS
Box plots: visualize basic statistics like outliers, min/max or quantiles

In [None]:
# For example: compare the attack of pokemons that are legendary or not
# (one box per value of 'Legendary').
# How to read the plot, per the original author's notes
# (NOTE(review): line colors depend on the matplotlib style in use; verify against the rendered figure):
# Black line at top is max
# Blue line at top is 75%
# Red line is median (50%)
# Blue line at bottom is 25%
# Black line at bottom is min
# There are no outliers (author's observation; TODO confirm on the rendered plot)
data.boxplot(column='Attack',by = 'Legendary')

In [None]:
# Plot the three stat columns together on a single set of axes
data1 = data[["Attack", "Defense", "Speed"]]
data1.plot()
# the overlapping lines make this plot confusing

In [None]:
# subplots: draw each column of data1 on its own axes instead of one shared plot
data1.plot(subplots = True)
plt.show()

In [None]:
# scatter plot of Defense against Attack
data1.plot.scatter(x="Attack", y="Defense")
plt.show()

In [None]:
# histogram of Defense: 50 bins over 0..250, normalised to a density
data1.plot.hist(y="Defense", bins=50, range=(0, 250), density=True)

In [None]:
# Histogram subplots: non-cumulative (top axes) and cumulative (bottom axes)
fig, axes = plt.subplots(nrows=2, ncols=1)
data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), density=True, ax=axes[0])
data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), density=True, ax=axes[1], cumulative=True)
plt.savefig('graph.png')   # write the figure to disk before showing it
plt.show()                 # was a bare `plt`, which only echoed the module repr as cell output

In [None]:
# For practice, take the head of the pokemon data and attach a list of dates to it.
# .copy() gives data2 its own memory, so adding a column does not trigger
# SettingWithCopyWarning or depend on whether head() returned a view of `data`.
data2 = data.head().copy()
date_list = ["1992-01-10","1992-02-10","1992-03-10","1993-03-15","1993-03-16"]
datetime_object = pd.to_datetime(date_list)   # strings -> DatetimeIndex
data2["date"] = datetime_object
# make the date column the index
data2 = data2.set_index("date")
data2

In [None]:
# Now we can select rows through the date index:
# first a single date label, then a label slice between two dates.
print(data2.loc["1993-03-16"])
print(data2.loc["1992-03-10":"1993-03-16"])

In [None]:
# Resample data2 (built in the previous part) by year and average each group.
# numeric_only=True is required on pandas >= 2.0: data2 still carries text columns,
# and averaging those raises instead of being skipped silently.
# NOTE(review): the "A" alias is deprecated in recent pandas in favour of "YE"; confirm against the pinned version.
data2.resample("A").mean(numeric_only=True)

In [None]:
# Resample by month and average each group.
# numeric_only=True keeps the text columns from making mean() raise (pandas >= 2.0).
# NOTE(review): the "M" alias is deprecated in recent pandas in favour of "ME"; confirm against the pinned version.
data2.resample("M").mean(numeric_only=True)
# There are a lot of NaN rows because data2 does not contain every month

In [None]:
# With real data (not hand-made like data2) we can fill the gaps left by resampling with interpolate.
# Here we take the first value of each month and interpolate linearly between them.
# NOTE(review): first() keeps the text columns; recent pandas versions may warn or fail when
# interpolating object-dtype columns. Verify on the pandas version in use.
data2.resample("M").first().interpolate("linear")

In [None]:
# Or interpolate linearly between the monthly means.
# numeric_only=True keeps the text columns from making mean() raise (pandas >= 2.0).
data2.resample("M").mean(numeric_only=True).interpolate("linear")

# CONCATENATING DATA
We can concatenate two DataFrames.

In [None]:
# Build two small frames: the first and the last rows of the pokemon data
data1 = data.head()
data2 = data.tail()
# axis=0 stacks the frames on top of each other (row-wise);
# ignore_index=True renumbers the combined rows from 0
conc_data_row = pd.concat(
    [data1, data2],
    axis=0,
    ignore_index=True,
)
conc_data_row

In [None]:
# Take the first rows of two single columns (each one is a Series)
data1 = data['Attack'].head()
data2= data['Defense'].head()
conc_data_col = pd.concat([data1,data2],axis =1) # axis = 1 : places the Series side by side as columns
conc_data_col

In [None]:
# Convert Type 1 from object (str) to categorical and Speed from int to float.
# Both assignments overwrite the existing columns of `data` in place.
data['Type 1'] = data['Type 1'].astype('category')
data['Speed'] = data['Speed'].astype('float')

In [None]:
# As you can see, Type 1 is converted from object to categorical
# and Speed is converted from int to float
data.dtypes

In [None]:
# Check whether the pokemon data has NaN values.
# Per the original author: there are 800 entries, but Type 2 has only 414 non-null values, so 386 are missing.
data.info()

In [None]:
# Let's check Type 2; dropna=False makes NaN show up as its own row in the counts
data["Type 2"].value_counts(dropna =False)
# Per the original author, there are 386 NaN values

In [None]:
# Drop the rows whose "Type 2" is missing.
# dropna(subset=...) returns a NEW frame, so `data` stays intact for the later
# fill-missing-values step. The previous form (`data1 = data` followed by
# `data1["Type 2"].dropna(inplace=True)`) only aliased `data` and ran dropna on a
# selected column, which never removes rows from the frame itself.
data1 = data.dropna(subset=["Type 2"])
# So does it work? This check fails loudly if any missing "Type 2" survived.
assert data1["Type 2"].notna().all()

In [None]:
#INDEXING DATA FRAMES
#Indexing using square brackets
#Using column attribute and row label
#Using loc accessor
#Selecting only some columns

In [None]:
# Reload the pokemon data from scratch and use its "#" column as the row index.
# NOTE(review): absolute path on a different drive than the first load cell.
data = pd.read_csv(r"D:\DATA SCIENCE\data set\python/pokemon.csv").set_index("#")
data.head()

In [None]:
# indexing using square brackets: pick the "Speed" column, then the entry labelled 1
# (labels come from the "#" column set as index when the data was reloaded)
data["Speed"][1]

In [None]:
# using column attribute and row label: data.HP is the "HP" column, [1] the entry labelled 1
data.HP[1]

In [None]:
# using loc accessor: row label 1, restricted to the column list ["HP"]
data.loc[1,["HP"]]

In [None]:
# Selecting only some columns: a list of column names returns a DataFrame with just those columns.
# (This cell was an exact duplicate of the loc-accessor cell; the section outline
# lists "Selecting only some columns" as its final topic.)
data[["HP","Attack"]]

In [None]:
# Selecting a column two ways gives two different types
hp_series = data["HP"]     # single label -> Series
hp_frame = data[["HP"]]    # list of labels -> DataFrame
print(type(hp_series))
print(type(hp_frame))

In [None]:
# Slicing with loc: row labels 1 to 10 and columns "HP" to "Defense"
data.loc[1:10,"HP":"Defense"]   # label slices include both end points: 10 and "Defense" are inclusive

In [None]:
# Reverse slicing: a step of -1 walks the row labels from 10 back to 1
data.loc[10:1:-1,"HP":"Defense"] 

In [None]:
# From something to the end: an open-ended slice takes every column from "Speed" onwards
data.loc[1:10,"Speed":] 

In [None]:
# Build a boolean Series (True where HP exceeds 200) and use it as a row mask
boolean = data["HP"].gt(200)
data.loc[boolean]

In [None]:
# Combine two boolean masks: keep rows where both conditions hold
first_filter = data["HP"].gt(150)
second_filter = data["Speed"].gt(35)
data.loc[first_filter & second_filter]

In [None]:
# Filter one column by a condition on another: HP of the rows whose Speed is below 15
data.loc[data.Speed < 15, "HP"]

In [None]:
# Plain Python function, to be applied to a column element by element
def div(n):
    """Return half of ``n`` using true division."""
    half = n / 2
    return half
# Call div on every HP value
data.HP.apply(div)

In [None]:
# Or use an anonymous (lambda) function that halves each HP value
data.HP.apply(lambda n : n/2)

In [None]:
# Define a new column from existing ones: total_power is the element-wise sum of Attack and Defense
data["total_power"] = data["Attack"].add(data["Defense"])
data.head()

In [None]:
# Show the current name of the index
print(data.index.name)
# Change it; assigning to index.name updates the index of `data` in place
data.index.name = "index_name"
data.head()

In [None]:
# Overwrite index
# To modify the index we have to replace all of its labels at once.
# Work on a copy (data3) so `data` keeps its original index.
data3 = data.copy()
# Make the index start from 100. Not a remarkable change, just an example.
# The range is sized from the frame itself instead of a hard-coded range(100, 900),
# which only fits a frame of exactly 800 rows.
first_label = 100
data3.index = range(first_label, first_label + len(data3))
data3.head()

In [None]:
# We can make one of the columns the index. I actually did it at the beginning of the "manipulating data frames with pandas" section.
# It was like this:
# data= data.set_index("#")
# You can also use:
# data.index = data["#"]

In [None]:
# Setting a two-level index: "Type 1" is the outer level, "Type 2" the inner level
data1 = data.set_index(["Type 1","Type 2"]) 
data1.head(100)
# data1.loc["Fire","Flying"] # how to select rows with both index levels