## 1. Load the data from a .csv file.

In [1]:
import pandas as pd
import numpy as np

# Read the Google stock history from the CSV file into a pandas DataFrame
Google_stock = pd.read_csv(filepath_or_buffer='./GOOG.csv')

# Report the kind of object that was loaded and its (rows, columns) shape
print(
    'Google_stock is of type:',
    type(Google_stock),
)
print(
    'Google_stock has shape:',
    Google_stock.shape,
)

Google_stock is of type: <class 'pandas.core.frame.DataFrame'>
Google_stock has shape: (3313, 7)


## 2. Look at the whole DataFrame (pandas shows a truncated view of the first and last rows)

In [2]:
# Evaluating the DataFrame as the last expression of the cell renders it as the
# cell's output; the saved output abbreviates the middle rows with "...".
Google_stock

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,2004-08-20,50.178635,54.187561,49.925285,53.805050,53.805050,23005800
2,2004-08-23,55.017166,56.373344,54.172661,54.346527,54.346527,18393200
3,2004-08-24,55.260582,55.439419,51.450363,52.096165,52.096165,15361800
4,2004-08-25,52.140873,53.651051,51.604362,52.657513,52.657513,9257400
...,...,...,...,...,...,...,...
3308,2017-10-09,980.000000,985.424988,976.109985,977.000000,977.000000,891400
3309,2017-10-10,980.000000,981.570007,966.080017,972.599976,972.599976,968400
3310,2017-10-11,973.719971,990.710022,972.250000,989.250000,989.250000,1693300
3311,2017-10-12,987.450012,994.119995,985.000000,987.830017,987.830017,1262400


## 3. Look at the first 5 rows of the DataFrame

In [3]:
# Show the first five rows (n=5 is also the default)
Google_stock.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,2004-08-20,50.178635,54.187561,49.925285,53.80505,53.80505,23005800
2,2004-08-23,55.017166,56.373344,54.172661,54.346527,54.346527,18393200
3,2004-08-24,55.260582,55.439419,51.450363,52.096165,52.096165,15361800
4,2004-08-25,52.140873,53.651051,51.604362,52.657513,52.657513,9257400


## 4. Look at the last 5 rows of the DataFrame

In [4]:
# Show the last five rows (n=5 is also the default)
Google_stock.tail(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
3308,2017-10-09,980.0,985.424988,976.109985,977.0,977.0,891400
3309,2017-10-10,980.0,981.570007,966.080017,972.599976,972.599976,968400
3310,2017-10-11,973.719971,990.710022,972.25,989.25,989.25,1693300
3311,2017-10-12,987.450012,994.119995,985.0,987.830017,987.830017,1262400
3312,2017-10-13,992.0,997.210022,989.0,989.679993,989.679993,1157700


## 5. Check whether any column contains a NaN (returns one boolean per column label)

In [5]:
# Flag missing values cell by cell, then reduce down each column:
# True means the column holds at least one NaN
Google_stock.isna().any(axis=0)

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

## 6. See the descriptive statistics of the DataFrame

In [6]:
# Summarise the stock data: count, mean, std, min, quartiles and max
# (the quartiles listed here are the same ones describe() uses by default)
Google_stock.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0
mean,380.186092,383.49374,376.519309,380.072458,380.072458,8038476.0
std,223.81865,224.974534,222.473232,223.85378,223.85378,8399521.0
min,49.274517,50.541279,47.669952,49.681866,49.681866,7900.0
25%,226.556473,228.394516,224.003082,226.40744,226.40744,2584900.0
50%,293.312286,295.433502,289.929291,293.029114,293.029114,5281300.0
75%,536.650024,540.0,532.409973,536.690002,536.690002,10653700.0
max,992.0,997.210022,989.0,989.679993,989.679993,82768100.0


## 7. See the descriptive statistics of one of the columns of the DataFrame

In [7]:
# Summarise only the 'Adj Close' column: select it as a Series, then describe it
Google_stock.loc[:, 'Adj Close'].describe()

count    3313.000000
mean      380.072458
std       223.853780
min        49.681866
25%       226.407440
50%       293.029114
75%       536.690002
max       989.679993
Name: Adj Close, dtype: float64

## 8. Statistical operations - Min, Max, and Mean

In [12]:
# We print summary statistics of our DataFrame.
# max() is taken over every column (the saved output includes the Date column),
# while mean() is restricted to the numeric columns.
# sep='' stops print() from inserting its default space after the '\n', which
# would otherwise indent the first row of each printed Series by one character.
print()
print('Maximum values of each column:\n', Google_stock.max(), sep='')
print()
print('Minimum Close value:', Google_stock['Close'].min())
print()
print('Average value of each column:\n', Google_stock.mean(numeric_only=True), sep='')


Maximum values of each column:
 Date         2017-10-13
Open              992.0
High         997.210022
Low               989.0
Close        989.679993
Adj Close    989.679993
Volume         82768100
dtype: object

Minimum Close value: 49.681866

Average value of each column:
 Open         3.801861e+02
High         3.834937e+02
Low          3.765193e+02
Close        3.800725e+02
Adj Close    3.800725e+02
Volume       8.038476e+06
dtype: float64


## 9. Statistical operation - Correlation

In [14]:
# Pairwise Pearson correlation (the default method) between the numeric columns
Google_stock.corr(method='pearson', numeric_only=True)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
Open,1.0,0.999904,0.999845,0.999745,0.999745,-0.564258
High,0.999904,1.0,0.999834,0.999868,0.999868,-0.562749
Low,0.999845,0.999834,1.0,0.999899,0.999899,-0.567007
Close,0.999745,0.999868,0.999899,1.0,1.0,-0.564967
Adj Close,0.999745,0.999868,0.999899,1.0,1.0,-0.564967
Volume,-0.564258,-0.562749,-0.567007,-0.564967,-0.564967,1.0


## 10. Demonstrate groupby() and sum() method

In [16]:
# We load fake Company data in a DataFrame
data = pd.read_csv('./fake_company.csv')

# Show the loaded records. A bare `data` in the middle of a cell is evaluated
# and discarded (only a cell's last expression is rendered), so display() is
# called explicitly.
display(data)

# We display the total amount of money spent in salaries each year
data.groupby(['Year'])['Salary'].sum()

Year
1990    153000
1991    162000
1992    174000
Name: Salary, dtype: int64

## 11. Demonstrate groupby() and mean() method

In [17]:
# Average salary for each year: group the rows by Year, keep the Salary
# column, and take the mean within each group
(
    data
    .groupby(by=['Year'])['Salary']
    .mean()
)

Year
1990    51000.0
1991    54000.0
1992    58000.0
Name: Salary, dtype: float64

## 12. Demonstrate groupby() on a single column

In [18]:
# Total salary paid to each employee across all the years in the data:
# group the rows by Name and add up the Salary column within each group
(
    data
    .groupby(by=['Name'])['Salary']
    .sum()
)

Name
Alice      50000
Bob        48000
Charlie    55000
Dakota     52000
Elsa       50000
Frank      60000
Grace      60000
Hoffman    52000
Inaar      62000
Name: Salary, dtype: int64

## 13. Demonstrate groupby() on two columns

In [19]:
# Total salary paid in each department for each year: grouping on two columns
# yields a Series indexed by (Year, Department)
(
    data
    .groupby(by=['Year', 'Department'])['Salary']
    .sum()
)

Year  Department
1990  Admin          55000
      HR             50000
      RD             48000
1991  Admin          60000
      HR             52000
      RD             50000
1992  Admin         122000
      RD             52000
Name: Salary, dtype: int64