# Importing and manipulating data with Pandas

In [1]:
# Import packages
import pandas as pd

In [2]:
# Path to CSV file and CSV filename
csv_file = 'data/amount-data.csv'

In [3]:
# Read the CSV file using pandas.
# read_csv parses the CSV format and returns the data in an indexed,
# tabular structure called a DataFrame.
df = pd.read_csv(csv_file)

In [4]:
# Display the data
df

Unnamed: 0,day,amount
0,2013-01-24,323.0
1,2013-01-25,233.0
2,2013-01-26,433.0
3,2013-01-27,555.0
4,2013-01-28,123.0
5,2013-01-29,
6,2013-01-30,221.0


In [5]:
# Display the type of the data
type(df)

pandas.core.frame.DataFrame

In [6]:
# See the top three rows
df.head(3)

Unnamed: 0,day,amount
0,2013-01-24,323.0
1,2013-01-25,233.0
2,2013-01-26,433.0


In [7]:
# See the last three rows
df.tail(3)

Unnamed: 0,day,amount
4,2013-01-28,123.0
5,2013-01-29,
6,2013-01-30,221.0


In [8]:
# Summarize the DataFrame: index range, column names, non-null counts,
# dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   day     7 non-null      object 
 1   amount  6 non-null      float64
dtypes: float64(1), object(1)
memory usage: 244.0+ bytes


In [9]:
# Generate descriptive statistics (count, mean, std, min, quartiles, max).
# Only the numeric 'amount' column is summarized here, and its missing value
# is excluded (count is 6 although the frame has 7 rows).
df.describe()

Unnamed: 0,amount
count,6.0
mean,314.666667
std,157.487354
min,123.0
25%,224.0
50%,278.0
75%,405.5
max,555.0


In [10]:
# Display the columns
df.columns

Index(['day', 'amount'], dtype='object')

In [11]:
# Display the datatype for each column
df.dtypes

day        object
amount    float64
dtype: object

In [12]:
# Select a single column with bracket notation; the result is a pandas Series
df['amount']

0    323.0
1    233.0
2    433.0
3    555.0
4    123.0
5      NaN
6    221.0
Name: amount, dtype: float64

In [13]:
df.amount

0    323.0
1    233.0
2    433.0
3    555.0
4    123.0
5      NaN
6    221.0
Name: amount, dtype: float64

In [15]:
# Select multiple columns
df[['day', 'amount']]

Unnamed: 0,day,amount
0,2013-01-24,323.0
1,2013-01-25,233.0
2,2013-01-26,433.0
3,2013-01-27,555.0
4,2013-01-28,123.0
5,2013-01-29,
6,2013-01-30,221.0


In [14]:
# Select multiple columns using a list of column names stored in a variable
column_list = ['day', 'amount']

df[column_list]

Unnamed: 0,day,amount
0,2013-01-24,323.0
1,2013-01-25,233.0
2,2013-01-26,433.0
3,2013-01-27,555.0
4,2013-01-28,123.0
5,2013-01-29,
6,2013-01-30,221.0


In [None]:
# Calculate the mean of the 'amount' column (missing values are skipped)
df['amount'].mean()

np.float64(314.6666666666667)

In [18]:
# Calculate the standard deviation of the 'amount' column
# (sample standard deviation; missing values are skipped)
df['amount'].std()

np.float64(157.48735398966696)

In [19]:
# Min amount
df['amount'].min()

np.float64(123.0)

In [20]:
# Max amount
df['amount'].max()

np.float64(555.0)

# Open a remote file or dataset, such as a CSV

In [21]:
# Opening a remote CSV file: read_csv accepts a URL in place of a local path.
# NOTE(review): the URL appears to point at a specific revision of a GitHub gist
# (hash in the path) — confirm it is still reachable before relying on it.
url = 'https://gist.githubusercontent.com/kevin336/acbb2271e66c10a5b73aacf82ca82784/raw/e38afe62e088394d61ed30884dd50a6826eee0a8/employees.csv'
df_remote_data = pd.read_csv(url)

# Display the top few rows (head() returns the first 5 by default)
df_remote_data.head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,198,Donald,OConnell,DOCONNEL,650.507.9833,21-JUN-07,SH_CLERK,2600,-,124,50
1,199,Douglas,Grant,DGRANT,650.507.9844,13-JAN-08,SH_CLERK,2600,-,124,50
2,200,Jennifer,Whalen,JWHALEN,515.123.4444,17-SEP-03,AD_ASST,4400,-,101,10
3,201,Michael,Hartstein,MHARTSTE,515.123.5555,17-FEB-04,MK_MAN,13000,-,100,20
4,202,Pat,Fay,PFAY,603.123.6666,17-AUG-05,MK_REP,6000,-,201,20


In [22]:
# Select specific columns: keep only EMAIL and SALARY in a new DataFrame
df_remote_data2 = df_remote_data[['EMAIL', 'SALARY']]

In [24]:
# Return the top 5 rows
df_remote_data2.head()

Unnamed: 0,EMAIL,SALARY
0,DOCONNEL,2600
1,DGRANT,2600
2,JWHALEN,4400
3,MHARTSTE,13000
4,PFAY,6000


In [26]:
# Filter the dataframe with a boolean mask:
# keep only the rows whose SALARY is strictly below 2600
df_remote_data2.loc[df_remote_data2['SALARY'] < 2600]

Unnamed: 0,EMAIL,SALARY
28,KCOLMENA,2500
36,JLANDRY,2400
37,SMARKLE,2200
40,JAMRLOW,2500
41,TJOLSON,2100
44,KGEE,2400
45,HPHILTAN,2200
49,JPATEL,2500


In [None]:
# Filter the dataframe with a boolean mask:
# keep only the rows whose SALARY is strictly above 2600
df_remote_data2.loc[df_remote_data2['SALARY'] > 2600]

Unnamed: 0,EMAIL,SALARY
2,JWHALEN,4400
3,MHARTSTE,13000
4,PFAY,6000
5,SMAVRIS,6500
6,HBAER,10000
7,SHIGGINS,12008
8,WGIETZ,8300
9,SKING,24000
10,NKOCHHAR,17000
11,LDEHAAN,17000
