# NumPy - Numeric Python
Docs  
https://www.numpy.org/  
  
**NumPy is the core library for scientific computing in Python. It provides a high-performance multidimensional array object and tools for working with these arrays.**  
### Main advantages of NumPy arrays over Python lists
1. Computations are faster
2. NumPy provides a set of practical and very easy to use tools for performing calculations over entire arrays

In [None]:
import numpy as np
from random import randint
from datetime import datetime

# testing performance difference between python lists and numpy array
def get_list_with_x_random_elements(x):
    """Return a list of ``x`` random integers, each drawn with ``randint(1, 9)``."""
    values = []
    for _ in range(x):
        values.append(randint(1, 9))
    return values

# Two independent samples of one million random digits, kept both as plain
# Python lists and as NumPy arrays so the same data can be timed in both forms.
list_a, list_b = (get_list_with_x_random_elements(1000000) for _ in range(2))
np_array_a, np_array_b = np.array(list_a), np.array(list_b)

def calculate_performance_for_python_list():
    """Time one element-wise ``(a / b) ** 2`` pass over ``list_a`` and ``list_b``.

    Returns:
        int: Elapsed wall-clock time in microseconds.
    """
    dt1 = datetime.now()
    # The computed values are discarded on purpose; only the elapsed time matters.
    _ = [(a / b) ** 2 for a, b in zip(list_a, list_b)]
    dt2 = datetime.now()
    # timedelta.microseconds holds only the sub-second component (0..999999),
    # so a run of one second or more would be under-reported; use the total.
    return round((dt2 - dt1).total_seconds() * 1000000)

def calculate_performance_for_numpy_array():
    """Time one vectorized ``(a / b) ** 2`` pass over ``np_array_a`` and ``np_array_b``.

    Returns:
        int: Elapsed wall-clock time in microseconds.
    """
    dt1 = datetime.now()
    # The computed values are discarded on purpose; only the elapsed time matters.
    _ = (np_array_a / np_array_b) ** 2
    dt2 = datetime.now()
    # timedelta.microseconds holds only the sub-second component (0..999999),
    # so a run of one second or more would be under-reported; use the total.
    return round((dt2 - dt1).total_seconds() * 1000000)

In [None]:
# Repeat each measurement 20 times: first all Python-list runs, then all NumPy runs
python_list_result = []
for _ in range(20):
    python_list_result.append(calculate_performance_for_python_list())

numpy_array_result = []
for _ in range(20):
    numpy_array_result.append(calculate_performance_for_numpy_array())

In [None]:
# Average of the collected timings for each approach
print("Python list performance:", np.mean(python_list_result))
print("NumPy array performance:", np.mean(numpy_array_result))

> ### Unlike Python lists, NumPy arrays can contain values of only a single type

In [None]:
# A list mixing int, float, str and bool: NumPy coerces every element to one common dtype
np.array([1, 2.4, "string", False])

### Boolean subsetting is very useful for quick value search

In [None]:
# Element-wise comparison produces a boolean mask with one entry per element of `a`
a = np.array([1, 2, 4, 8, 11])
bolean_filter = np.greater(a, 10)
bolean_filter

In [None]:
# Indexing with the boolean mask keeps only the elements where the mask is True
a[bolean_filter]

In [None]:
# Replacing values based on logical operations:
# 1 where the element is greater than 10, 0 everywhere else
np.where(a > 10, 1, 0)

### Basic statistics with NumPy

In [None]:
# Draw 10 samples from a normal distribution with mean 100 and
# standard deviation 5
normal_dist = np.random.normal(loc=100, scale=5, size=10)
normal_dist

In [None]:
# Round every sample to two decimal places
normal_dist = normal_dist.round(2)
normal_dist

In [None]:
# Basic descriptive statistics of the sample
print("Mean:", normal_dist.mean())
print("Median:", np.median(normal_dist))
print("Standard deviation:", normal_dist.std())
print("Variance:", normal_dist.var())

In [None]:
# Relation between two sets: draw a second sample (mean 200, standard
# deviation 10, 10 values) and print the correlation coefficient matrix
normal_dist2 = np.random.normal(loc=200, scale=10, size=10)
print("Correlation coefficients:", np.corrcoef(normal_dist, normal_dist2), sep="\n")

In [None]:
# Operations that are also available for standard Python lists, but work much faster on arrays
for label, func in (
    ("Max value:", np.max),
    ("Min value:", np.min),
    ("Index of min value:", np.argmin),
    ("Index of max value:", np.argmax),
    ("Sum:", np.sum),
    ("Product:", np.prod),
    ("Sorted:", np.sort),
):
    print(label, func(normal_dist))

### Mathematical operations on NumPy arrays - they work element-wise!

In [None]:
# Two arrays of nine integers each: 11..19 and the multiples of ten 10..90
array_1 = np.arange(11, 20)
array_2 = np.arange(10, 100, 10)

In [None]:
# Each operation is applied pairwise to elements at the same position
print("Addition:", np.add(array_1, array_2))
print("Subtraction:", np.subtract(array_2, array_1))
print("Multiplication:", np.multiply(array_1, array_2))
print("Division:", np.true_divide(array_1, array_2))

In [None]:
# Adding a scalar: 100 is added to every element of the array
array_1 + 100 

### Other useful NumPy features

In [None]:
# Arrays of ten elements filled with zeros and with ones
zeros, ones = np.zeros(10), np.ones(10)

print(zeros, ones, sep="\n")

In [None]:
# Shape of the array: a tuple with the size of each dimension
ones.shape

In [None]:
# Turn the flat array into 10 rows by 1 column. reshape() is used instead of
# assigning to .shape, which the NumPy documentation discourages.
ones = ones.reshape(10, 1)
ones

In [None]:
# Generating an array with a constant step:
# 8 evenly spaced values from 2 to 100 (both ends included)
np.linspace(start=2, stop=100, num=8)

In [None]:
# dot product (scalar product) of the two arrays
array_1 @ array_2

### Exercise - perform the following tasks
**Given the two lists created below (list_A and list_B), do the following**  
  
1. Transform both lists to NumPy arrays
2. Multiply each element of array A by 37. Assign the result back to array A
3. Add 38 to each element of array B. Assign the result back to array B
4. Calculate the standard deviation of array B and round the result to 1 decimal place. Save the result to variable x
5. Divide each element of array A by variable x. Assign the result back to array A
6. Calculate the mean of array A and round the result to 2 decimal places. Save the result to variable y
7. Create array C only from those elements of array A which are smaller than y. Round all elements of array C to 1 decimal place
8. Sum all elements of array C, multiply the result by x and divide by y. Round to 2 decimal places


In [None]:
# Input data for the exercise
list_A = [100, 46, 45, 82, 90]
list_B = [404, 24, 87, 99, 12]

# Pandas - Python Data Analysis Library

Docs  
https://pandas.pydata.org/  
  
**Pandas is an open source library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language**  
### In pandas we store data in a so-called DataFrame  
**Data Frame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns).**  
**ROWS = OBSERVATIONS**  
**COLUMNS = VARIABLES**

In [None]:
import pandas as pd

# Creating a DataFrame from a Python dictionary:
# every key becomes a column name and every list the values of that column
dict_data = dict(
    country=["Poland", "Germany", "Spain", "Denmark"],
    capital=["Warsaw", "Berlin", "Madrid", "Copenhagen"],
    area=[312679, 357386, 505990, 42933],
    population=[38.4, 82.8, 46.7, 5.7],
)

df = pd.DataFrame(data=dict_data)
df

In [None]:
# Pandas assigns index (0,1,2,3) automatically but we can set index manually.
# The result is only displayed here, not assigned, so `df` keeps its original index.
df.set_index('country')

In [None]:
# Importing data from a CSV file
df = pd.read_csv('../input/otomoto.csv')

# Keep the first 10 Tesla offers and renumber them from 0. drop=True discards
# the old index instead of inserting it as an 'index' column that would then
# have to be deleted again.
df = df[df.make == 'Tesla'].head(10).reset_index(drop=True)
df

In [None]:
# Selecting specific column (single brackets with one column name)
price_column = df["price"]
price_column

In [None]:
# Check what kind of object a single selected column is
type(price_column)

A Series is a one-dimensional array in which each element is labelled by an index

In [None]:
# Syntax for selecting a specific column but in the format of a DataFrame:
# pass a list of column names (note the double brackets)
price_column_df = df[["price"]]
price_column_df

In [None]:
# Check what kind of object the list-based selection returns
type(price_column_df)

In [None]:
# Selecting multiple columns by passing a list of their names
price_year_df = df[['price', 'year']]
price_year_df

In [None]:
# Keep only the rows for cars priced below 200 000 PLN
is_cheap = df["price"] < 200000
cheap_tesla = df[is_cheap]
cheap_tesla

In [None]:
# Combining two conditions with a logical AND (element-wise on the boolean columns)
filtering_mask = (df["price"] < 200000) & (df["year"] == 2015)
df[filtering_mask]

In [None]:
# Combining two conditions with a logical OR (element-wise on the boolean columns)
filtering_mask = (df["price"] < 200000) | (df["mileage"] < 10000)
df[filtering_mask]

In [None]:
# Iterating over a data frame with a for loop:
# iterrows() yields one (index label, row) pair per row
for label, row in cheap_tesla.iterrows():
    print("Label:", label)
    print("Row values:", row, sep="\n")

# Remember! Iterating row by row is not very efficient

In [None]:
# New column: the price divided by 3.9 (PLN per EUR), rounded to two decimal places
df["price_EUR"] = (df["price"] / 3.9).round(2)

In [None]:
# We can apply any user defined function for each element with 'apply'
def my_func(x, rate=3.9):
    """Convert a price to EUR and round it to two decimal places.

    Args:
        x: Price to convert (a number or a NumPy/pandas array-like).
        rate: Divisor used for the conversion; defaults to 3.9, the value
            this function used before the rate became a parameter.

    Returns:
        ``x / rate`` rounded to two decimal places.
    """
    return np.round(x / rate, 2)

# apply() calls my_func once for every value of the "price" column
df["price_EUR"] = df["price"].apply(my_func)
df

In [None]:
# Collecting basic info about a DataFrame:
# reload the full data set and display floats with two decimal places
df = pd.read_csv('../input/otomoto.csv')
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels of the DataFrame
df.columns

In [None]:
# Basic info about each column (prints a summary of the DataFrame)
df.info()

In [None]:
# Basic statistical information (summary statistics) about numerical columns
df.describe()

In [None]:
# See the top rows of a DataFrame (head() returns the first 5 by default)
df.head()

In [None]:
# See the bottom rows of a DataFrame (tail() returns the last 5 by default)
df.tail()

In [None]:
# Sort the DataFrame by price, cheapest offers first
df = df.sort_values(by="price")
df.head()

In [None]:
# Getting rid of currency column - converting all rows to PLN currency
eur_pln_ratio = 4.31


def _price_in_pln(offer):
    """Return the offer's price in PLN; EUR prices are converted and truncated to int."""
    if offer['currency'] != 'EUR':
        return offer['price']
    return int(offer['price'] * eur_pln_ratio)


df['price'] = df.apply(_price_in_pln, axis=1)
# Show the row with index label 70123
df.loc[70123]


In [None]:
# remove the currency column in place
df.drop(columns=['currency'], inplace=True)

In [None]:
# Explore prices distribution
from pylab import rcParams
# Set the figure size before plotting so that it applies to this histogram too
rcParams['figure.figsize'] = 7,7
# `kind` is passed by keyword: current pandas rejects positional arguments to Series.plot()
df.price.plot(kind='hist')

In [None]:
# The 20 most expensive offers
df.sort_values("price", ascending=False).head(20)

In [None]:
# Reading the standard deviation of the price column
np.std(df.price)

In [None]:
# keep only rows where price is below 200 000 PLN
df = df.loc[df["price"] < 200000]

In [None]:
# Plotting the histogram once again
# (`kind` is passed by keyword: current pandas rejects positional arguments to Series.plot())
df.price.plot(kind='hist')

In [None]:
# keep only rows where price is above 2000 PLN
df = df.loc[df["price"] > 2000]

In [None]:
# Plotting the histogram once again
# (`kind` is passed by keyword: current pandas rejects positional arguments to Series.plot())
df.price.plot(kind='hist')

In [None]:
# Exploring makes: how many distinct makes are there?
df["make"].nunique()

In [None]:
# Let pandas display up to 100 rows before truncating the output
pd.set_option('display.max_rows', 100)

In [None]:
# Number of rows for each make
df.make.value_counts()

In [None]:
# Inspect the rows whose make is the literal string "Samochód"
df[df.make == "Samochód"]

In [None]:
# keep every row except those whose make is equal to 'Samochód'
df = df.loc[df["make"] != 'Samochód']

In [None]:
# keep only rows whose make occurs more than 500 times
# (i.e. delete all rows with a make that has 500 or fewer occurrences)
make_counts = df['make'].value_counts()
# map() looks up, for every row, how often that row's make occurs
df = df[df['make'].map(make_counts) > 500]
len(df.make.value_counts())

In [None]:
# Exploring fuel column: number of rows for each fuel type
df.fuel.value_counts()

In [None]:
# Drop the rows whose fuel is "Wodór" or 'Etanol', then recount the fuel types
excluded_fuel = df["fuel"].isin(["Wodór", 'Etanol'])
df = df[~excluded_fuel]
df.fuel.value_counts()

In [None]:
import matplotlib.pyplot as plt
# correlation between year and price: price on the x axis, year on the y axis
_ = plt.scatter(x=df['price'], y=df['year'])
plt.show()

In [None]:
# keep only rows where year is later than 1970
df = df.loc[df["year"] > 1970]

In [None]:
# price (x axis) against year (y axis) once more
_ = plt.scatter(x=df['price'], y=df['year'])
plt.show()

In [None]:
# correlation between price and mileage: price on the x axis, mileage on the y axis
_ = plt.scatter(x=df['price'], y=df['mileage'])
plt.show()

In [None]:
# keep only rows where mileage is below 700 000
df = df.loc[df["mileage"] < 700000]

In [None]:
# correlation between price and mileage: price on the x axis, mileage on the y axis
_ = plt.scatter(x=df['price'], y=df['mileage'])
plt.show()

In [None]:
# correlation between year and mileage: year on the x axis, mileage on the y axis
_ = plt.scatter(x=df['year'], y=df['mileage'])
plt.show()

In [None]:
# year distribution as a histogram with 20 bins
_ = plt.hist(x=df['year'], bins=20)
plt.show()

In [None]:
# mileage distribution as a histogram with 20 bins
_ = plt.hist(x=df['mileage'], bins=20)
plt.show()

In [None]:
# relation between price and engine volume: price on the x axis, engine on the y axis
_ = plt.scatter(x=df['price'], y=df['engine'])
plt.show()

In [None]:
# remove outliers with an extremely high engine volume (keep engine below 15000)
df = df.loc[df["engine"] < 15000]

In [None]:
# relation between price and engine volume: price on the x axis, engine on the y axis
_ = plt.scatter(x=df['price'], y=df['engine'])
plt.show()

In [None]:
# How much our DataFrame was reduced: number of rows that are left
len(df)

In [None]:
# how fast car prices go down with each year of usage?

# Mean price for each year. groupby() returns the groups sorted by year, so
# the line plot below connects the points in chronological order (iterating
# over a set of years gives no ordering guarantee and can produce a zigzag).
price_by_year = df.groupby("year")["price"].mean()
unique_years = list(price_by_year.index)
# astype(int) truncates the means to whole numbers
mean_prices = price_by_year.values.astype(int)

# mean price on the x axis, year on the y axis
_ = plt.scatter(mean_prices, unique_years)
plt.show()
_ = plt.plot(mean_prices, unique_years)
plt.show()