In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
# pandas has 2 main datatypes: Series (1-dimensional) and DataFrame (2-dimensional).
# A Series built from a plain Python list gets a default 0..n-1 integer index.
series = pd.Series(["BMW", "Toyota", "Honda"])

In [3]:
series

0       BMW
1    Toyota
2     Honda
dtype: object

In [4]:
# series = 1-dimensional

In [5]:
colours = pd.Series(["Red", "Blue", "White"])
colours

0      Red
1     Blue
2    White
dtype: object

In [6]:
# DataFrame = 2-dimensional
car_data = pd.DataFrame({"Car make": series, "Colour": colours})
car_data

Unnamed: 0,Car make,Colour
0,BMW,Red
1,Toyota,Blue
2,Honda,White


In [7]:
# Import data
car_sales = pd.read_csv("car-sales.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'car-sales.csv'

In [None]:
car_sales

![Anatomy of a pandas DataFrame](pandas-anatomy-of-a-dataframe.png)

In [None]:
# Exporting a dataframe
car_sales.to_csv("exported-car-sales.csv") # index will also be exported
export_car_sales = pd.read_csv("exported-car-sales.csv")
export_car_sales

In [None]:
car_sales.to_csv("exported-car-sales.csv", index=False)
export_car_sales = pd.read_csv("exported-car-sales.csv")
export_car_sales

## Describe data

In [None]:
# Attribute: accessed without parentheses; here it reports the dtype of each column
car_sales.dtypes

# Function (method): called with parentheses, for example:
#car_sales.to_csv()

In [None]:
car_sales.columns

In [None]:
car_columns = car_sales.columns
car_columns

In [None]:
car_sales.index

In [None]:
car_sales

In [None]:
car_sales.describe()

In [None]:
car_sales.info()

In [None]:
# Mean of the numeric columns only.
# From pandas 2.0 the default is numeric_only=False, so a bare .mean() raises
# TypeError on a frame that also holds text columns such as "Make" and "Colour".
car_sales.mean(numeric_only=True)

In [None]:
# .mean() works on a standalone Series as well as on a DataFrame
price_values = [3000, 1500, 111250]
car_prices = pd.Series(price_values)
car_prices.mean()

In [None]:
car_sales.sum()

In [None]:
car_sales["Doors"].sum()

In [None]:
len(car_sales)

In [None]:
car_sales

## Viewing and selecting data

In [None]:
car_sales.head()

In [None]:
car_sales.head(7)

In [None]:
car_sales.tail()

In [None]:
car_sales.tail(3)

In [None]:
# .loc & .iloc demo data: the index labels are deliberately unordered and the
# label 3 appears twice, so label lookup and position lookup give different results.
animal_names = ["cat", "dog", "bird", "panda", "snake"]
animal_labels = [0, 3, 9, 8, 3]
animals = pd.Series(animal_names, index=animal_labels)
animals

In [None]:
# .loc refers to index
animals.loc[3]

In [None]:
# Label 9 occurs once in the `animals` index, so .loc returns a single value
animals.loc[9]

In [None]:
car_sales.loc[3]

In [None]:
# .iloc refers to position
animals.iloc[3]

In [None]:
car_sales

In [None]:
car_sales.iloc[3]

In [None]:
animals

In [None]:
animals.iloc[:3]

In [None]:
car_sales.loc[:3]

In [None]:
car_sales.head(4)

In [None]:
car_sales["Make"]

In [None]:
car_sales["Colour"]

In [None]:
car_sales["Make"]

In [None]:
car_sales.Make

In [None]:
car_sales[car_sales["Make"] == "Toyota"]

In [None]:
car_sales[car_sales[ "Odometer (KM)"] > 100000 ]

In [None]:
pd.crosstab(car_sales["Make"], car_sales["Doors"])

In [None]:
# Groupby: average of every numeric column for each make.
# numeric_only=True skips the text columns (e.g. "Colour"); from pandas 2.0 the
# default would try to average them and raise TypeError.
car_sales.groupby(["Make"]).mean(numeric_only=True)

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# NOTE: matplotlib.pyplot is already imported as plt in the first cell
import matplotlib.pyplot as plt

In [None]:
car_sales["Odometer (KM)"].plot()

In [None]:
car_sales["Odometer (KM)"].hist()

In [None]:
car_sales["Price"].dtype

In [None]:
#car_sales["Price"].plot()

In [None]:
car_sales = pd.read_csv("car-sales.csv")
car_sales

In [None]:
# Remove every "$" and "," from the Price strings, then convert to float.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so the character class would never
# match and astype(float) would fail on the untouched text.
# The raw string avoids Python's invalid-escape-sequence warning for "\$".
car_sales["Price"] = (
    car_sales["Price"]
    .str.replace(r"[\$,]", "", regex=True)
    .astype(float)
)

In [None]:
car_sales

In [None]:
car_sales["Price"].plot()

## Manipulating Data

In [None]:
car_sales["Make"].str.lower()

In [None]:
car_sales

In [None]:
car_sales["Make"] = car_sales["Make"].str.lower()

In [None]:
car_sales

In [None]:
car_sales

In [None]:
car_sales_missing = pd.read_csv("car-sales-missing-data.csv")
car_sales_missing

In [None]:
# Mean of the Odometer column (not displayed: a cell only shows its last expression)
car_sales_missing["Odometer"].mean()

# Fill the NaN missing values with the mean of the currently available values.
# Without inplace or assignment this only returns a new Series for preview.
car_sales_missing["Odometer"].fillna(car_sales_missing["Odometer"].mean())

In [None]:
# Fill missing Odometer readings with the column mean and assign the result back.
# fillna(..., inplace=True) on a column selected with [] is chained assignment:
# it can operate on a temporary object and leave car_sales_missing unchanged
# (always the case under pandas Copy-on-Write), so explicit assignment is used.
odometer_mean = car_sales_missing["Odometer"].mean()
car_sales_missing["Odometer"] = car_sales_missing["Odometer"].fillna(odometer_mean)

In [None]:
car_sales_missing

In [None]:
car_sales_missing.dropna()

In [None]:
car_sales_missing.dropna(inplace=True)
car_sales_missing

In [None]:
car_sales_missing = pd.read_csv("car-sales-missing-data.csv")
car_sales_missing

In [None]:
car_sales_missing_dropped = car_sales_missing.dropna()
car_sales_missing_dropped 

In [None]:
# Save the rows that remained after dropna().
# index=False is not passed, so the index is written to the file as well.
car_sales_missing_dropped.to_csv("car-sales-missing-dropped.csv")

In [None]:
# Column from series
seats_column = pd.Series([5, 5, 5, 5, 5])

# New column called seats
car_sales["Seats"] = seats_column
car_sales

In [None]:
# Fill any Seats values that are still missing with 5.
# The result is assigned back instead of using inplace=True on a column
# selection, which is chained assignment and does not reliably update the
# parent DataFrame (it never does under pandas Copy-on-Write).
car_sales["Seats"] = car_sales["Seats"].fillna(5)
car_sales

In [None]:
# Column from Python list
fuel_economy = [7.5, 9.2, 3.5, 5.0, 8.7, 3.2, 8.8, 2.5, 9.9, 1.0]
car_sales["Fuel per 100KM"] = fuel_economy
car_sales

In [None]:
 car_sales["Total fuel used (L)"] = car_sales["Odometer (KM)"]/100 * car_sales["Fuel per 100KM"]

In [None]:
car_sales

In [None]:
# Create a column from a single value (the scalar is assigned to every row)
car_sales["Number of wheels"] = 4
car_sales

In [None]:
car_sales["Passed road safety"] = True
car_sales.dtypes

In [None]:
car_sales

In [None]:
# Remove the derived fuel column. The label must match the name the column was
# created with, including the "(L)" suffix; otherwise drop raises KeyError.
car_sales = car_sales.drop(columns=["Total fuel used (L)"])

![Anatomy of a pandas DataFrame](pandas-anatomy-of-a-dataframe.png)

In [None]:
car_sales

In [None]:
# frac=1 samples 100% of the rows, which returns them in random order (a shuffle).
# No random_state is given, so the order differs on every run.
car_sales_shuffled = car_sales.sample(frac=1)

In [None]:
car_sales_shuffled

In [None]:
# Only select 20% of the data (a random sample of the rows)
car_sales_shuffled.sample(frac=0.2)

In [None]:
car_sales_shuffled.reset_index()

In [None]:
car_sales_shuffled.reset_index(drop=True, inplace=True)

In [None]:
car_sales_shuffled

In [None]:
car_sales

In [None]:
# `lambda` is a Python keyword that defines a small anonymous function.
# apply() calls it on each value of the column; here every reading is divided by 1.6.
# NOTE(review): dividing by 1.6 presumably converts km to miles, yet the result is
# written back under the "Odometer (KM)" label — confirm this is intended.
# Re-running this cell divides the values again, so it is not idempotent.
car_sales["Odometer (KM)"] = car_sales["Odometer (KM)"].apply(lambda x: x/1.6)
car_sales

In [None]:
# Try it, run your code
# Search for it
# Try again
# Ask