In [1]:
import numpy as np
import pandas as pd

If you get an error, run the following command in a separate cell:

> conda install pandas

# Dataframes from scratch

In [None]:
# Column-oriented data: every key is a column name, every list holds that
# column's values (all lists share the same length)
dico = dict(
    name=["Ben", "Max", "John", "Ema", "Alice"],
    age=[25, 20, 18, 10, 33],
    gender=["m", "m", "m", "f", "f"],
)
dico

In [None]:
# Dict keys become the column labels; each list becomes that column's values
pd.DataFrame(dico)

In [None]:
# A Series is a single labelled column; `name` is the label it will carry
# when it is later combined into a DataFrame
x = pd.Series(
    data=["Ben", "Max", "John", "Ema", "Alice"],
    name="name",
)
x

In [None]:
# Two more named Series, one per remaining column
y = pd.Series(data=[25, 20, 18, 10, 33], name="age")
z = pd.Series(data=["m", "m", "m", "f", "f"], name="gender")

# Glue the Series side by side; each Series name becomes a column label
pd.concat([x, y, z], axis="columns")

## Exercises

* Create your own data frame with 2 columns and 100 rows:
    * first column should be a random name: Ben, Tom or John (use np.random.choice)
    * second column should be a random age between 20 and 40 
* print the last 6 rows

# Manipulating data

## Import files

In [5]:
# Load the video-game sales CSV into a DataFrame; the relative path is
# resolved against the kernel's working directory
df = pd.read_csv("vgsales.csv")

In [6]:
# Preview the first rows (head() shows 5 rows by default)
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,259,Asteroids,2600,1980,Shooter,Atari,4.0,0.26,0.0,0.05,4.31
1,545,Missile Command,2600,1980,Shooter,Atari,2.56,0.17,0.0,0.03,2.76
2,1768,Kaboom!,2600,1980,Misc,Activision,1.07,0.07,0.0,0.01,1.15
3,1971,Defender,2600,1980,Misc,Atari,0.99,0.05,0.0,0.01,1.05
4,2671,Boxing,2600,1980,Fighting,Activision,0.72,0.04,0.0,0.01,0.77


In [None]:
# Last 3 rows of the frame
df.tail(3)

In [None]:
# Tuple of (number of rows, number of columns)
df.shape

In [None]:
# Column labels of the frame
df.columns

In [None]:
# Concise summary: index range, non-null count and dtype per column, memory usage
df.info()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Selecting a single column label with [] returns a Series
df["Global_Sales"].head()

In [None]:
# Show the type at each level: the whole frame, one column of it, and the
# array returned by that column's .values attribute
for obj in (df, df["Global_Sales"], df["Global_Sales"].values):
    print(type(obj))

In [None]:
# A list of labels inside [] selects several columns and returns a DataFrame
df[["Name", "Rank"]].head()

In [None]:
# Position-based slicing: the first 5 rows (end position is excluded)
df.iloc[:5]

In [None]:
# Position-based slicing on both axes: first 5 rows, first 3 columns
df.iloc[:5,:3]

In [None]:
# Label-based slicing: unlike .iloc, BOTH endpoints are included, so this
# keeps index labels up to and including 4 and columns up to and including
# "EU_Sales"
df.loc[:4,:"EU_Sales"]

## statistics

In [None]:
# Average of the column (missing values are skipped by default)
df["Global_Sales"].mean()

In [None]:
# Smallest value in the column
df["Global_Sales"].min()

In [None]:
# Sample variance of the column (pandas defaults to ddof=1)
df["Global_Sales"].var()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Number of rows per platform, most frequent first
df["Platform"].value_counts()

## SQL like operations: filter, groupby, sort, joins

In [None]:
# Element-wise comparison: yields a boolean Series, True where the platform is "PS4"
df["Platform"] == "PS4"

In [None]:
# Use the boolean Series as a row filter, then show at most 10 matching rows
df[df["Platform"] == "PS4"].head(10)

In [22]:
# Sort by Year, with ties broken by Platform; a single ascending=False
# applies to both keys, so both are in descending order
df.sort_values(["Year", "Platform"], ascending=False).head(5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
15984,847,Tom Clancy's The Division,XOne,2016,Shooter,Ubisoft,1.2,0.62,0.0,0.18,2.01
15988,1226,FIFA 17,XOne,2016,Sports,Electronic Arts,0.17,1.26,0.0,0.1,1.53
15998,2436,Far Cry: Primal,XOne,2016,Action,Ubisoft,0.46,0.32,0.0,0.07,0.85
15999,2446,Overwatch,XOne,2016,Shooter,Activision,0.52,0.25,0.0,0.08,0.85
16002,2507,Madden NFL 17,XOne,2016,Sports,Electronic Arts,0.72,0.02,0.0,0.09,0.82


In [None]:
# Average global sales per platform; as_index=False keeps "Platform" as a
# regular column instead of turning it into the index
df.groupby("Platform", as_index=False)["Global_Sales"].mean()

In [None]:
# Two small frames sharing an "id" key.
# id 4 exists only in df1 and id 3 only in df2, so each join type below
# returns a different set of rows.
df1 = pd.DataFrame(dict(id=[1, 2, 4, 5], var1=[22, 21, 18, 30]))
df2 = pd.DataFrame(dict(id=[1, 2, 3, 5], var2=[1, 1, 3, 2]))

display(df1, df2)

In [None]:
# Left join: keep every row of df1; ids with no match in df2 get NaN in var2
df1.merge(df2, how="left", on="id")

In [None]:
# Inner join: keep only the ids present in both frames
df1.merge(df2, how="inner", on="id")

In [None]:
# Outer join: keep the ids of both frames; unmatched sides are filled with NaN
df1.merge(df2, how="outer", on="id")

## Exercise

* select the 10 top-ranked video games and keep only the following columns: Name, Year, Genre, Publisher, EU_Sales
* what do they have in common ?

* select top 5 video games from PS4 or XOne with highest global sales in descending order

## data cleaning

disclaimer: data cleaning and preparation is a broad topic and cannot be covered in a few hours only.

I show only the most basic steps here; you will need to consult the documentation when you work on more complex data

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Replace missing values with the placeholder "missing" in non-numeric
# columns only. A blanket df.fillna("missing") would also write the string
# into numeric columns, silently converting them to dtype object and breaking
# numeric sorting, comparisons and statistics on them.
# Numeric columns therefore keep their NaN (pandas aggregations skip NaN by
# default); impute or drop those separately if the analysis requires it.
text_columns = df.select_dtypes(exclude="number").columns
df = df.fillna({column: "missing" for column in text_columns})

# Remaining counts: non-numeric columns are now 0, numeric ones are unchanged
df.isna().sum()

In [None]:
# Remove rows that are exact duplicates of an earlier row (the first
# occurrence is kept); a no-op if there are no duplicates
df = df.drop_duplicates()

In [None]:
# Vectorised string method through the .str accessor; the result is a new
# Series and df itself is not modified
df["Name"].str.upper().head()

In [14]:
# Split each name at the first space only (n=1); expand=True spreads the two
# parts over columns 0 and 1. Names without a space have no second part, so
# fillna("") shows an empty string there instead of a missing value
df["Name"].str.split(" ", n=1, expand=True).fillna("").head(10)

Unnamed: 0,0,1
0,Asteroids,
1,Missile,Command
2,Kaboom!,
3,Defender,
4,Boxing,
5,Ice,Hockey
6,Freeway,
7,Bridge,
8,Checkers,
9,Pitfall!,


In [17]:
# New text column combining platform and genre, joined with a dash
df["platform_genre"] = df["Platform"].str.cat(df["Genre"], sep="-")
df["platform_genre"].head()

0     2600-Shooter
1     2600-Shooter
2        2600-Misc
3        2600-Misc
4    2600-Fighting
Name: platform_genre, dtype: object

In [None]:
# Boolean flag column: True where Global_Sales is strictly greater than 10
df["blockbuster"] = df["Global_Sales"].gt(10)
df.head()

In [None]:
# A boolean column can be used directly as a row filter; keep the flagged
# rows and order them by Name
df[df["blockbuster"]].sort_values("Name")

## data reshaping

In [None]:
# Excel-style pivot table for analysis: one row per platform, one column per
# genre, each cell holding the summed global sales of that combination
df.pivot_table(
    values="Global_Sales",
    index="Platform",
    columns="Genre",
    aggfunc="sum",
)

In [None]:
# Returns a new frame in which the current index is moved into a regular
# column and replaced by a fresh 0..n-1 index; the result is not assigned,
# so df itself is unchanged.
# NOTE(review): possibly meant to be chained onto the pivot table of the
# previous cell to flatten its "Platform" index - confirm the intent.
df.reset_index()

In [None]:
# Wide to long, needed for visualization purposes: Platform and Genre are
# kept as identifier columns, while the two sales columns are stacked into a
# "variable" column (the original column name) and a "value" column (its
# number), giving two output rows per input row
df.melt(id_vars=['Platform', 'Genre'], value_vars=["EU_Sales", "Global_Sales"])

# Exercises

* Import the netflix dataset
* display info about the dataset (nrows, columns, etc)

* Check if there are missing values in columns
* identify columns with more than 9% missing values and drop them
* also remove the show_id column

* Answer the following questions:
    * are there more movies or TV shows?
    * what are the 3 countries producing the most movies?
    * what are the longest and the shortest films?
* (Optional) List TV-shows that have more than 10 seasons?
