# Introduction to Numpy/Pandas


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/UCL-DSPP-2425/ECON0128-lectures/blob/main/09_python/intro_pandas.ipynb)

## numpy

In [4]:
# load numpy package into the namespace
import numpy as np

In [5]:
# list the names available in the numpy namespace
dir(np)

['ALLOW_THREADS',
 'BUFSIZE',
 'CLIP',
 'DataSource',
 'ERR_CALL',
 'ERR_DEFAULT',
 'ERR_IGNORE',
 'ERR_LOG',
 'ERR_PRINT',
 'ERR_RAISE',
 'ERR_WARN',
 'FLOATING_POINT_SUPPORT',
 'FPE_DIVIDEBYZERO',
 'FPE_INVALID',
 'FPE_OVERFLOW',
 'FPE_UNDERFLOW',
 'False_',
 'Inf',
 'Infinity',
 'MAXDIMS',
 'MAY_SHARE_BOUNDS',
 'MAY_SHARE_EXACT',
 'NAN',
 'NINF',
 'NZERO',
 'NaN',
 'PINF',
 'PZERO',
 'RAISE',
 'SHIFT_DIVIDEBYZERO',
 'SHIFT_INVALID',
 'SHIFT_OVERFLOW',
 'SHIFT_UNDERFLOW',
 'ScalarType',
 'True_',
 'UFUNC_BUFSIZE_DEFAULT',
 'UFUNC_PYVALS_NAME',
 'WRAP',
 '_CopyMode',
 '_NoValue',
 '_UFUNC_API',
 '__NUMPY_SETUP__',
 '__all__',
 '__builtins__',
 '__cached__',
 '__config__',
 '__deprecated_attrs__',
 '__dir__',
 '__doc__',
 '__expired_functions__',
 '__file__',
 '__former_attrs__',
 '__future_scalars__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_add_newdoc_ufunc',
 '_builtins',
 '_distributor_init',
 '_financial_names',
 '_ge

In [6]:
# a 1D array: unlike a Python list, every element shares a single data type
# (comparable to an atomic vector in R)
a = np.array([1, 2, 3, 4, 5])

# a 2D array, written as a list of rows
b = np.array(
    [
        [1, 2, 8],
        [4, 5, 6],
        [7, 8, 10],
    ]
)

# a 3D array: two stacked 2x2 blocks
c = np.array(
    [
        [[1, 2], [3, 4]],
        [[5, 6], [7, 8]],
    ]
)

In [7]:
# get the shape of the array: a tuple with one length per dimension,
# so the 1D array `a` with five elements has shape (5,)

print(a.shape)

(5,)


In [8]:
# get the data type (dtype) shared by all elements of the array;
# `a` was built from Python ints, so it holds integers

print(a.dtype)

int64


In [9]:
# convert the elements to another type; astype returns a new array
# and leaves `a` itself unchanged

print(a.astype(float))

[1. 2. 3. 4. 5.]


In [10]:
# math operations between an array and a number are applied element by element

print(a + 1)
print(a - 1)
print(a * 2)
print(a / 2)  # true division: the result is a float array

[2 3 4 5 6]
[0 1 2 3 4]
[ 2  4  6  8 10]
[0.5 1.  1.5 2.  2.5]


In [11]:
# matrix operations

print(np.dot(b, b))  # matrix product of b with itself (not an element-wise square)

b_inv = np.linalg.inv(b)  # matrix inverse of b

print(b_inv)
# b times its inverse should be the identity matrix; off-diagonal entries of
# size ~1e-16 are floating-point rounding error, not real non-zero values
print(np.dot(b, b_inv))

# determinant of b: exactly -18 when computed by hand, but evaluated here in
# floating point, so the printed value can be off by a tiny rounding error
print(np.linalg.det(b))

[[ 65  76 100]
 [ 66  81 122]
 [109 134 204]]
[[-0.11111111 -2.44444444  1.55555556]
 [-0.11111111  2.55555556 -1.44444444]
 [ 0.16666667 -0.33333333  0.16666667]]
[[ 1.00000000e+00 -4.44089210e-16  2.22044605e-16]
 [-2.22044605e-16  1.00000000e+00 -2.22044605e-16]
 [ 0.00000000e+00 -9.99200722e-16  1.00000000e+00]]
-17.999999999999996


In [12]:
# apply functions to arrays: numpy's math functions act on every element

print(np.sin(a))  # sine, with the elements read as radians
print(np.exp(a))  # e raised to the power of each element
print(np.log(a))  # natural logarithm (base e)

[ 0.84147098  0.90929743  0.14112001 -0.7568025  -0.95892427]
[  2.71828183   7.3890561   20.08553692  54.59815003 148.4131591 ]
[0.         0.69314718 1.09861229 1.38629436 1.60943791]


In [13]:
print(a.mean()) # mean of array a
print(a.std())  # standard deviation; numpy divides by n by default (ddof=0)

print(b.mean())  # mean over all nine elements of b
print(b.mean(axis=0)) # axis=0 averages down the rows: one mean per column
print(b.mean(axis=1)) # axis=1 averages across the columns: one mean per row

3.0
1.4142135623730951
5.666666666666667
[4. 5. 8.]
[3.66666667 5.         8.33333333]


## pandas

In [None]:
# load pandas into memory
import pandas as pd

In [None]:
!pip install pandas

In [None]:
# what are we importing? collect the public names that pandas exposes
# (in an interactive session you can also type `pd.` and press Tab to browse
# them; a bare `pd.` left in a cell is a syntax error and stops "Run All")
pd_public_names = [name for name in dir(pd) if not name.startswith("_")]
pd_public_names

In [None]:
# data for a small dataframe, held in a dictionary: one key per column,
# each mapped to the list of that column's values (countries and currencies)
my_dictionary = {
    "country_name": ["Spain", "Colombia", "Turkey"],
    "currency": ["Euro", "Peso", "Lira"],
}
my_dictionary

In [None]:
# turn the dictionary into a dataframe (keys become column names) and display it
df = pd.DataFrame(my_dictionary)
df

In [None]:
# read data from a csv file (Colab sample files: sample_data/california_housing_train.csv)
# NOTE: the relative path only resolves where that sample_data/ folder exists (e.g. on Colab);
# this also rebinds `df`, replacing the small countries dataframe built above
df = pd.read_csv("sample_data/california_housing_train.csv")
df

In [None]:
# reading data from other file types (JSON); the result is stored in its own
# variable, so `df` still holds the housing data
df_json = pd.read_json("sample_data/anscombe.json")
df_json

In [None]:
# reading data from other file types (Excel, Stata); the data files are not provided, so the calls below are left commented out
#df_excel = pd.read_excel("data/iris.xlsx", sheet_name="Sheet 1")
#df_stata = pd.read_stata("data/iris.dta")

In [None]:
# read data from a file that uses a different separator (here a tab); the data file is not provided, so the call below is left commented out
#df = pd.read_csv("data/iris.txt", sep="\t")

### Working with DataFrames

In [None]:
# display the dataframe we will work with (the housing data read from the csv file)
df

In [None]:
# describe dataframe: summary statistics (count, mean, std, min, quartiles, max)
# for each numeric column
df.describe()

In [None]:
# get all the column names from a dataframe (returned as a pandas Index)
df.columns

In [None]:
# dimensions of the dataframe as (number of rows, number of columns)
df.shape

In [None]:
# overview of the dataframe: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# select a column using its name (the result is a pandas Series)
df["population"]

In [None]:
# select a row using its index

In [None]:
# .loc selects by index label: this returns the row whose label is 2
# (with the default 0, 1, 2, ... index that is the third row)
df.loc[2]

In [None]:
# select a particular cell: row label first, then column name
df.loc[2, "population"]

In [None]:
# filter data: the comparison produces one True/False value per row,
# and .loc keeps only the rows where it is True
df.loc[df["population"] > 1000]

In [None]:
# the same kind of filter, applied to a different column
df.loc[df["households"] > 1000]

In [None]:
# combine conditions with & (element-wise "and"); each comparison needs its own
# parentheses because & binds more tightly than >
df.loc[(df["population"] > 1000) & (df["households"] > 1000)]

In [None]:
# sum all elements of a column (what is the total population?)
# missing values (NaN), if any, are skipped by default
df["population"].sum()

In [None]:
# average of a column (what is the mean population per row?)
df["population"].mean()

In [None]:
# apply a custom function to all elements of a column; apply calls the lambda
# once per value. For simple arithmetic like this, the vectorized expression
# df["population"] ** 2 + 10 computes the same values without a Python-level loop.
df["population"].apply(lambda x: x**2 + 10)

In [None]:
# create a new column by assigning to a column name that does not exist yet;
# this modifies `df` itself rather than returning a copy
df["population_new"] = df["population"].apply(lambda x: x**2 + 10)
df

In [None]:
# new columns can also be computed directly from existing ones: the division
# is done row by row (a row with population 0 would not yield a finite value)
df["rooms_per_person"] = df["total_rooms"] / df["population"]
df

### Repeating our first data science program

In [None]:
# read data from a public URL (a plain string: there is nothing to interpolate,
# so no f-prefix is needed)
url = "https://www.dropbox.com/s/xk3vm6l8jaw2k9o/income_by_year_age.csv?dl=1"
# read_csv is given the URL directly, so this cell needs network access;
# it rebinds `df`, which from here on holds the income_by_year_age data
df = pd.read_csv(url)
df

In [None]:
# first rows of the dataframe (five by default)
df.head()

In [None]:
# last rows of the dataframe (five by default)
df.tail()

In [None]:
# first values of the "16-24" column only
df["16-24"].head()

In [None]:
# mean and standard deviation of the "16-24" column
print(df["16-24"].mean())
# pandas computes the sample standard deviation (ddof=1, divides by n-1),
# whereas numpy's .std() defaults to ddof=0
print(df["16-24"].std())

In [None]:
# plot the variable "16-24" as a line, with the dataframe's row index on the x-axis

df["16-24"].plot()

In [None]:
# plot the variable "16-24" and the variable "65+" in the same plot

# selecting both columns and plotting them together draws the two lines on one
# set of axes and adds a legend, so the lines can be told apart
df[["16-24", "65+"]].plot()

In [None]:
# compute the mean of all but the first column: iloc[:, 1:] keeps every row and
# drops column 0, and .mean() then averages each remaining column
# (assumes the first column is not an age group, e.g. a year -- TODO confirm)

age_means = df.iloc[:, 1:].mean()
age_means

In [None]:
# bar chart of age_means: one bar per column, with the bar height equal to that
# column's mean (kind="bar" draws a bar chart, not a histogram)

age_means.plot(kind="bar")