# Introduction to Python

In [10]:
# Calculator: Python evaluates arithmetic expressions and joins strings with +
# (run the cell with CTRL + ENTER)
a = (50 + 1.45) / 12.5
b = "hello" + " " + "world"

# Print both results, one per line
print(a, b, sep="\n")

4.1160000000000005
hello world


In [11]:
# Lists: elements may have different types, and a list can contain another list
x = [3, "hello", [3, 4]]
y = [1, 2, 3]
print(x + y)  # + concatenates the two lists into a new list

[3, 'hello', [3, 4], 1, 2, 3]


In [12]:
# Flow control: a for loop over range(5) visits 0, 1, 2, 3, 4 (5 itself is excluded)
for i in range(5):
    print(i)

0
1
2
3
4


In [13]:
# A for loop can also walk over the elements of a list directly
# (x is the mixed-type list defined in the "Lists" cell)
for i in x:
    print(i)

3
hello
[3, 4]


In [14]:
# A while loop repeats its body for as long as the condition holds
i = 0
while i < 10:
    print(i)
    i += 1  # advance the counter, otherwise the loop would never terminate

0
1
2
3
4
5
6
7
8
9


In [15]:
# Functions
def plus(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total

plus(1, 2)

3

## Numpy

In [None]:
# Install library (the %pip magic requires IPython 7.3 or later)
# %pip install numpy  

In [None]:
# Older IPython versions (no %pip magic): install through the Python interpreter the kernel runs on
# import sys
# !{sys.executable} -m pip install numpy

In [16]:
# Import library
import numpy as np

In [17]:
# Numpy arrays are created from ordinary Python lists
x = np.array([3, 4, 5])
y = np.array([4, 9, 7])

# Print each array on its own line
for arr in (x, y):
    print(arr)

[3 4 5]
[4 9 7]


In [18]:
# Numpy allows mathematical operations between arrays:
# + adds the two arrays element by element
print(x+y)

[ 7 13 12]


In [19]:
# Different from normal lists: for lists, + concatenates instead of adding.
# Note that x and y are rebound to plain lists here.
x = [3,4,5]
y = [4,9,7]
print(x+y)

[3, 4, 5, 4, 9, 7]


In [20]:
# 2-dimensional arrays

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])

print(a*b)  # * multiplies element by element (see the output: 1*5, 2*6, 3*7, 4*8)

a.dtype  # element type of the array; NOTE(review): the integer width shown may depend on the platform

[[ 5 12]
 [21 32]]


dtype('int32')

In [None]:
# Documentation: a trailing ? is IPython/Jupyter syntax that shows the help text for an object
np.array?

In [22]:
# Array dimensions (a is the 2x2 array created above)
print(a.ndim)    # Number of dimensions (axes)
print(a.shape)   # Shape of the array: number of elements along each dimension

2
(2, 2)


In [23]:
# Reshaping arrays: reshape returns the same elements arranged in a new shape
a = np.array([[1, 2], [3, 4]])
print(a.reshape([1,4]))  # [rows, columns] -> 1 row with 4 columns

[[1 2 3 4]]


In [24]:
# The new shape can also be passed as separate arguments
c = np.array([1, 2, 3, 4, 5, 6])
print(c.reshape(2, 3))    # Convert a 1d array to a 2x3 array

[[1 2 3]
 [4 5 6]]


## Indexing Arrays

In [25]:
# Build a 4x4 example matrix holding the numbers 0..15, filled row by row.
# np.arange already returns an ndarray, so no extra np.array(...) wrapper is needed.
A = np.arange(16).reshape((4, 4))
A

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [26]:
# Select single element

print(A[1,2]) #[row, column], both counted from 0
# or index the row first and then the element within that row
print(A[1][2])

6
6


In [27]:
# Select entire row/column
print(A[0,:])    # : selects everything -> first row

print(A[0])      # a single index selects along the first axis -> also the first row

print(A[:,0])    # all rows, first column

[0 1 2 3]
[0 1 2 3]
[ 0  4  8 12]


In [28]:
# Select multiple rows/columns by passing a list of indices
print(A[[1, 2],:])    # 2nd and 3rd row
print(A[:, [2,3]])    # 3rd and 4th column

[[ 4  5  6  7]
 [ 8  9 10 11]]
[[ 2  3]
 [ 6  7]
 [10 11]
 [14 15]]


In [29]:
# Select a submatrix
# Two index lists are paired up element by element, so this does not give a 2x2 block
print(A[[0,1], [0,1]]) # Doesn't work, selects individual elements [0,0] and [1,1]

[0 5]


In [30]:
# Indexing in two steps gives the 2x2 block; A[np.ix_([0,1], [0,1])] is the one-step equivalent
print(A[[0,1]][:,[0,1]]) # Select the rows first, then the columns

[[0 1]
 [4 5]]


In [31]:
print(A[0:2, 0:2]) # Or with slices (the stop index is excluded: rows 0-1, columns 0-1)

[[0 1]
 [4 5]]


## Boolean indexing

In [32]:
# A plain list of the numbers 0..9 and a mask that is True at every even position
A = list(range(10))
B = [position % 2 == 0 for position in range(10)]

In [None]:
A[B]  # Error (TypeError) - This does not work with Python lists

In [33]:
# With numpy arrays, we select every element, where B == True
A = np.array(A)  # convert the list to an array; the mask B can stay a plain list
print(A[B])

[0 2 4 6 8]


In [34]:
# We can use boolean indexing to select elements matching specific conditions:
# a comparison on an array produces a boolean mask of the same length
print(A[A > 5])
print(A[A % 2 == 0])
print(A[(A > 2) & (A < 7)])  # combine masks with & and wrap each comparison in parentheses

[6 7 8 9]
[0 2 4 6 8]
[3 4 5 6]


In [None]:
# Also works with n-dimensional arrays
# (np.arange already returns an ndarray, so the extra np.array(...) copy is dropped)
A = np.arange(16).reshape((4, 4))
print(A)
rows = [True, False, True, False]   # keep the 1st and 3rd row
cols = [True, True, False, False]   # keep the 1st and 2nd column
A[rows][:,cols]                     # filter the rows first, then the columns

In [36]:
# Select rows that sum up to more than 25
row_totals = A.sum(axis=1)     # one total per row
high_rows = row_totals > 25    # boolean mask over the rows
print(high_rows)
A[high_rows]

[False False  True  True]


array([[ 8,  9, 10, 11],
       [12, 13, 14, 15]])

# Pandas and Datasets

In [37]:
# In addition to Numpy arrays, we can use additional libraries to better process larger datasets
import pandas as pd

In [38]:
# Load the example cars dataset straight from the seaborn-data repository on GitHub
auto_url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv'
auto = pd.read_csv(auto_url)
auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


## Pandas dataframes

In [39]:
# How many examples are in a dataframe: len counts the rows
len(auto)

398

In [40]:
# Print column names (an Index object holding the column labels)
auto.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

In [47]:
# Are there missing values in the dataset?
# The built-in any() must not be applied to the DataFrame directly: iterating a
# DataFrame yields its column labels, and non-empty strings are always truthy,
# so any(auto.isna()) is True even when nothing is missing.
# Reduce the boolean frame with pandas instead: per column first, then overall.
bool(auto.isna().any().any())
# auto.isna()  # the full True/False table, one entry per cell

True

In [42]:
# Count the missing values per column
# (isna() marks missing cells as True; sum() counts the True values of each column)
auto.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [44]:
# Drop missing values: dropna() returns a new DataFrame without the rows that contain them
auto = auto.dropna()
print(len(auto))           # number of rows that are left
print(auto.isna().sum())   # every column should now report 0 missing values

392
mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64


In [45]:
# Drop columns: drop() returns a new DataFrame without the given column.
# The result is not assigned here, so auto itself still has the horsepower column.
auto.drop(columns="horsepower")

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,2625,18.6,82,usa,ford ranger


### Indexing Pandas datasets

In [None]:
auto[1] # Error (KeyError): square brackets look up a column label, and no column is named 1

In [48]:
print(auto['mpg'])  # Index by column names instead; a single column is returned as a Series

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 392, dtype: float64


In [50]:
auto[:3]     # Or use slices to select rows (here: the first three rows)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite


In [53]:
auto.loc[0,"mpg"] # Or with .loc[row_label, column_name] (labels from the index, not positions)

18.0

In [52]:
print(auto.iloc[0,0])    # Or with .iloc[row_position, column_position], both counted from 0

18.0


In [55]:
(auto.loc[0:5, ["mpg", "cylinders"]])  # Can also select multiple rows and columns; the .loc slice includes the end label 5

Unnamed: 0,mpg,cylinders
0,18.0,8
1,15.0,8
2,18.0,8
3,16.0,8
4,17.0,8
5,15.0,8


We can use logical (boolean) indexing to select specific rows:

In [None]:
# Build a boolean mask (one True/False per row), then use it as the row selector
idx_80 = auto['model_year'] > 80
(auto.loc[idx_80, ['name', 'model_year']])

In [None]:
# Or in one line, building the mask directly inside .loc
(auto.loc[auto['model_year'] > 80, ['name', 'model_year']])

## Adding values

In [59]:
# Create new column: assigning to a column name that does not exist yet adds it.
# The division is carried out row by row.
auto['Horsepower / Weight'] = auto['horsepower'] / auto['weight']
auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,Horsepower / Weight
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0.037100
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0.044679
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0.043655
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0.043694
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0.040591
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl,0.030824
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup,0.024413
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage,0.036601
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger,0.030095


In [60]:
# Create a new empty row as a pd.Series.
# The labels are taken from the DataFrame itself instead of being typed out by
# hand, so the row always lines up with whatever columns auto currently has.
new_row = pd.Series([None] * len(auto.columns), index=auto.columns)

# Concatenate the new empty row with the original DataFrame:
# to_frame().T turns the Series into a one-row DataFrame, and ignore_index=True
# renumbers the rows of the result from 0.
# NOTE(review): pandas reports a warning for this call (see the cell output),
# presumably because the appended row contains only missing values - confirm.
result = pd.concat([auto, new_row.to_frame().T], ignore_index=True)
result

  result = pd.concat([auto, new_row.to_frame().T], ignore_index=True)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,Horsepower / Weight
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0.037100
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0.044679
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0.043655
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0.043694
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0.040591
...,...,...,...,...,...,...,...,...,...,...
388,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup,0.024413
389,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage,0.036601
390,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger,0.030095
391,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10,0.030147


## Additional functions

In [61]:
# Merge datasets: pd.concat stacks the rows of the given DataFrames on top of each other
pd.concat([auto, auto])

# This can also be used to add new rows - simply create a new dataframe and concat it to the original

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,Horsepower / Weight
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0.037100
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0.044679
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0.043655
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0.043694
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0.040591
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl,0.030824
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup,0.024413
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage,0.036601
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger,0.030095


In [62]:
# Save dataset as comma-separated text; index=False leaves the row index out of the file
auto.to_csv('./is_lab_1_dataset.txt', index=False)