In [1]:
import pandas as pd

In [4]:
# Creating data type: Series

# providing a list as data argument, default labels
my_series_list = pd.Series([1, 2, 3])
print(my_series_list) # default labels are integers 0, 1, 2, ...

# providing a list as data argument, custom labels
my_series_list_custom = pd.Series([1, 2, 3], index = ["a", "b", "c"])
print(my_series_list_custom) # to make custom labels provide a second "index argument"

# providing a dictionary as data argument
my_series_dict = pd.Series({"a": 1, "b": 2, "c":3})
print(my_series_dict) # keys become labels


0    1
1    2
2    3
dtype: int64
a    1
b    2
c    3
dtype: int64
a    1
b    2
c    3
dtype: int64


In [18]:
# Creating data type: DataFrame

# data frame from a series
my_df_series = pd.DataFrame(my_series_list)
print(my_df_series) # contains only one column

# data frame from a dictionary of series
my_df_series_2 = pd.DataFrame({"a": my_series_list, "b": my_series_list})
print(my_df_series_2)

# data frame from a dictionary
my_df_dict = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c":[5, 6]})
print(my_df_dict)

# custom row labels with the "index" argument
my_df_labels = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c":[5, 6]}, index = ["first_row", "second_row"])
print(my_df_labels)

# selecting columns with the "colums" argument
my_df_columns = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c":[5, 6]}, columns = ["a", "b"])
print(my_df_columns)

# renaming columns with the "rename()" method
my_df_columns.rename(columns = {'a':'new_a', 'b':'new_b'}, inplace = True)
print(my_df_columns)

# renaming columns with the "columns()" attribute
my_df_columns.columns = ["newest_a", "newest_b"]
print(my_df_columns)


   0
0  1
1  2
2  3
   a  b
0  1  1
1  2  2
2  3  3
   a  b  c
0  1  3  5
1  2  4  6
            a  b  c
first_row   1  3  5
second_row  2  4  6
   a  b
0  1  3
1  2  4
   new_a  new_b
0      1      3
1      2      4
   newest_a  newest_b
0         1         3
1         2         4


In [23]:
# Loading files into a DataFrame: CSV

my_csv = pd.read_csv("my_file.csv", names = ["A", "B"])
print(my_csv) # The parameter "names" is optional and used to specify which columns to load from the file


   A  B
0  A  B
1  1  a
2  2  b
3  3  c
4  4  d
5  5  e


In [25]:
# Loading files into a DataFrame: JSON

my_json = pd.read_json("my_json.json")
print(my_json)


   Column_A Column_B
0         1        a
1         2        b
2         3        c


In [46]:
# Accessing values in a Series
print(my_series_list_custom)

# You can access values by their label names
x = my_series_list_custom["b"]
print(x)

# Or you can access values by their index position
y = my_series_list_custom[2]
print(y)

# You can use the square brackets notation to set values
my_series_list_custom[0] = 100
print(my_series_list_custom)

# You can use the semicolon inside the square brackets to select ranges of values
z = my_series_list_custom[0:2]
print(z)

# This is what will happen if you won't supply the end index
o = my_series_list_custom[1:]
print(o) # [2, 3]

# This is what will happen if you won't supply the start index
e = my_series_list_custom[:2]
print(e) # [100,2]

# Using logic inside the sqare brackets notation
b = my_series_list_custom[my_series_list_custom > 50]
print(b) # 100

w = my_series_list_custom[my_series_list_custom  != 100]
print(w) # [2, 3]

a = my_series_list_custom[(my_series_list_custom  != 100) | (my_series_list_custom != 2)]
print(a) # [2, 3]

a    100
b      2
c      3
dtype: int64
2
3
a    100
b      2
c      3
dtype: int64
a    100
b      2
dtype: int64
b    2
c    3
dtype: int64
a    100
b      2
dtype: int64
a    100
dtype: int64
b    2
c    3
dtype: int64
a    100
b      2
c      3
dtype: int64


In [122]:
# Accessing values in a DataFrame

test_df = pd.DataFrame({"col_a": [1, 2, 3], "col_b": ["a", "b", "c"]})

# Output a single column selected by its name
print(test_df["col_a"]) 

# Output multiple columns selected by their name (provide a list)
print(test_df[["col_a", "col_b"]]) 

# Accessing rows using the .loc[] attribute
print(test_df.loc[0]) # col_a    1, col_b    a
print(test_df.loc[1]) # col_a    2, col_b    b
print(test_df.loc[2]) # col_a    3, col_b    c

# The loc attribute can be used for setting values
test_df.loc[1] = 200
print(test_df)

# Returning multiple rows
print(test_df.loc[1:2])

# Accessing labelled rows
print(my_df_labels)
print(my_df_labels.loc["second_row"])

# Using the .iloc[] attribute
print(test_df.iloc[0])
print(test_df.loc[0])

# loc vs iloc
print(my_df_labels)
print(my_df_labels.loc["first_row"])
print(my_df_labels.iloc[0])

# Using two arguments in .loc[] attribute
print(test_df)
print(test_df.loc[0:1, "col_a"])         # Returns a Series
print(test_df.loc[0:1, "col_a":"col_b"]) # Returns a DataFrame

print(my_df_labels)
print(my_df_labels.loc["first_row":"second_row", "a":"b"])
# is the same as:
print(my_df_labels.iloc[0:2, 0:2])

# You can use semicolons or provide lists as first two arguments
print(my_df_labels.iloc[0:2, 0:2])
# is the same as:
print(my_df_labels.iloc[[0, 1], [0, 1]])

print(my_df_labels)
print(my_df_labels.iloc[[0, 1], :])
print(my_df_labels.iloc[[0, 1], ])

# What are .at[] and .iat[] attributes?
print(my_df_labels.iloc[1, 1])
print(my_df_labels.iat[1, 1])

print(type(my_df_labels.iloc[1, 1]))
print(type(my_df_labels.iat[1, 1]))

# at
print(my_df_labels.at["second_row", "b"])

# iat
print(my_df_labels.iat[1, 1])

# Using logic inside the square bracket notation
print(my_df_labels[my_df_labels["a"] == 2])
print(my_df_labels[my_df_labels > 2])

# Can we add a new row with simple assignment?
print()

0    1
1    2
2    3
Name: col_a, dtype: int64
   col_a col_b
0      1     a
1      2     b
2      3     c
col_a    1
col_b    a
Name: 0, dtype: object
col_a    2
col_b    b
Name: 1, dtype: object
col_a    3
col_b    c
Name: 2, dtype: object
   col_a col_b
0      1     a
1    200   200
2      3     c
   col_a col_b
1    200   200
2      3     c
            a  b  c
first_row   1  3  5
second_row  2  4  6
a    2
b    4
c    6
Name: second_row, dtype: int64
col_a    1
col_b    a
Name: 0, dtype: object
col_a    1
col_b    a
Name: 0, dtype: object
            a  b  c
first_row   1  3  5
second_row  2  4  6
a    1
b    3
c    5
Name: first_row, dtype: int64
a    1
b    3
c    5
Name: first_row, dtype: int64
   col_a col_b
0      1     a
1    200   200
2      3     c
0      1
1    200
Name: col_a, dtype: int64
   col_a col_b
0      1     a
1    200   200
            a  b  c
first_row   1  3  5
second_row  2  4  6
            a  b
first_row   1  3
second_row  2  4
            a  b
first_row   

In [163]:
# Data Manipulation

# Adding a row to a DataTable
print(test_df)

n_row = pd.DataFrame({"col_a": [4, 8], "col_b": ["d", "e"]})

new_df = pd.concat([test_df, n_row])
print(new_df)

test_df = pd.concat([test_df, n_row])
print(test_df)

# Can we add a new row with simple assignment? Yes
print(my_df_labels)

my_df_labels.loc["third_row"] = {"a":3, "b":5, "c":7}
print(my_df_labels)

# Adding a new column to a DataTable
n_row["col_c"] = pd.DataFrame({"col_c": [True]})
n_row["col_d"] = 100
n_row["col_d"] = [1, 2]
print(n_row)

   col_a col_b
0      1     a
1    200   200
2      3     c
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
1      8     e
0      4     d
1      8     e
   col_a col_b
0      1     a
1    200   200
2      3     c
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
1      8     e
0      4     d
1      8     e
0      4     d
1      8     e
   col_a col_b
0      1     a
1    200   200
2      3     c
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
0      4     d
1      8     e
0      4  

In [169]:
# Data Manipulation: From wide to narrow format

df_wide = pd.DataFrame({"Person": ["Bob", "Alice", "Steve"], "Age": [32, 24, 64], "Weight": [75, 66, 102], "Height": [180, 175, 165]})
print(df_wide)

print(pd.melt(df_wide, id_vars = "Person"))

  Person  Age  Weight  Height
0    Bob   32      75     180
1  Alice   24      66     175
2  Steve   64     102     165
  Person variable  value
0    Bob      Age     32
1  Alice      Age     24
2  Steve      Age     64
3    Bob   Weight     75
4  Alice   Weight     66
5  Steve   Weight    102
6    Bob   Height    180
7  Alice   Height    175
8  Steve   Height    165


In [176]:
# Data Manipulation: From narrow to wide
df_narrow = pd.DataFrame({"Person": ["Bob", "Bob", "Bob", "Alice", "Alice", "Alice", "Steve", "Steve", "Steve"],
                         "Variable": ["Age", "Weigth", "Height", "Age", "Weigth", "Height", "Age", "Weigth", "Height"],
                         "Value": [32, 75, 180, 24, 66, 175, 64, 102, 165]})
#print(df_narrow)

print(df_narrow.pivot(index = ["Person"], columns = ["Variable"], values = ["Value"]))

         Value              
Variable   Age Height Weigth
Person                      
Alice       24    175     66
Bob         32    180     75
Steve       64    165    102


In [208]:
# Defining sample data frames for join operations

df_1 = pd.DataFrame({"var_1": ["A", "B", "C"], "var_2": [1, 2, 3]})
df_2 = pd.DataFrame({"var_1": ["A", "B", "D"], "var_3": [True, False, True]})

print(df_1)
print(
)
print(df_2)
print(
)


  var_1  var_2
0     A      1
1     B      2
2     C      3

  var_1  var_3
0     A   True
1     B  False
2     D   True



In [209]:
# Inner join

print(pd.merge(df_1, df_2, how = "inner", on = "var_1")) # on parameter is optional it asks for a shared variable
#

  var_1  var_2  var_3
0     A      1   True
1     B      2  False


In [210]:
# Left (Outer) join
print(pd.merge(df_1, df_2, how = "left"))
#

  var_1  var_2  var_3
0     A      1   True
1     B      2  False
2     C      3    NaN


In [213]:
# Right (Outer) join
print(pd.merge(df_1, df_2, how = "right"))
#

  var_1  var_2  var_3
0     A    1.0   True
1     B    2.0  False
2     D    NaN   True


In [212]:
# Full (Outer) join
print(pd.merge(df_1, df_2, how = "outer"))
#

  var_1  var_2  var_3
0     A    1.0   True
1     B    2.0  False
2     C    3.0    NaN
3     D    NaN   True
