# **Part 2a: Introduction to Pandas and NumPy for Data Analysis**


# **INTRODUCTION TO NUMPY**

In [None]:
import numpy as np #Importing the NumPy Library


In [None]:
# Creating a 1D ndarray from a Python list
data_ndarray = np.array([5, 10, 15, 20])

# ndarrays and the NumPy library simplify data manipulation and analysis

In standard Python, we might use a list of lists to represent datasets. While that is fine for small datasets, it is not ideal for larger ones.

During iteration, Python converts our code into bytecode, which instructs our computer's processor to add the numbers one step at a time. **NumPy leverages a processor feature called Single Instruction Multiple Data (SIMD)** to process several values with one instruction, which makes data processing faster.

In [None]:
import csv
import numpy as np

# Import nyc_taxis.csv as a list of lists (one inner list of strings per CSV row).
# The `with` block guarantees the file handle is closed once the rows are read,
# even if reading raises an error.
with open("/kaggle/input/part2/nyc_taxis.csv", "r") as f:
    taxi_list = list(csv.reader(f))

In [None]:
# Remove the header row by keeping everything after the first row.
# NOTE: this reassigns taxi_list from itself, so re-running the cell drops
# one more row each time.
taxi_list = taxi_list[1:]



In [None]:
# Convert every value to a float, one row at a time
converted_taxi_list = [
    [float(value) for value in record]
    for record in taxi_list
]

In [None]:
# Build an ndarray from the converted list of lists
taxi = np.array( converted_taxi_list)

In [None]:
# Display the contents of the array
print(taxi)

In [None]:
# When we can't easily print the entire ndarray, we can use
# 'ndarray.shape' to identify the number of rows and columns.

taxi.shape

In [None]:
# Indexing syntax (placeholders, not runnable as written):
#   ndarray[row_index]                -> all columns for the selected rows
#   ndarray[row_index, column_index]  -> selected columns for the selected rows
#
# The same syntax demonstrated on a small array so the cell runs on a fresh kernel.
demo_ndarray = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Select all columns for a given set of rows
demo_rows = demo_ndarray[:2]

# Select particular columns for a given set of rows
demo_rows_cols = demo_ndarray[:2, 1:]
demo_rows_cols

In [None]:
# Vector operations: arithmetic on an ndarray is applied to every element
trip_distance_miles, trip_length_seconds = taxi[:, 7], taxi[:, 8]

# Dividing the seconds column by 3600 expresses each trip length in hours
trip_length_hours = trip_length_seconds / 3600
trip_length_hours

In [None]:
#Numpy comes with a variety of methods to make calculations a breeze
# 1. ndarray.min()
# 2. ndarray.max()
# 3. ndarray.mean()
# 4. ndarray.sum()


**BOOLEAN INDEXING WITH NUMPY**

In [None]:
#Importing the CSV file into an ndarray directly.

import numpy as np
taxi = np.genfromtxt("/kaggle/input/part2/nyc_taxis.csv", delimiter = ",") 
# We can also use the 'skip_header = 1' argument to skip the header row
taxi

In [None]:
# Inspect the first row of the array
# (presumably the header row, which cannot be parsed as numbers — verify the output)
print(taxi[0])

In [None]:
# Removing the header row from our 'ndarray' by keeping every row after the first.
# NOTE: re-running this cell drops one more row each time.
taxi = taxi[1:]

In [None]:
# Checking the internal data type of the array's elements
print(taxi.dtype)

# NB: an ndarray can only contain one data type

In [None]:
# Average speed in miles per hour: distance column divided by duration in hours
trip_hours = taxi[:, 8] / 3600
trip_mph = taxi[:, 7] / trip_hours

In [None]:
# Assigning values in an ndarray: index a single position and assign to it
a = np.array(['red', 'blue', 'black', 'blue', 'purple'])
a[0] = 'orange'
print(a)

In [None]:
# Assigning to a slice sets every element from index 3 onwards to the same value
a[3:] = 'pink'
print(a)

# **INTRODUCTION TO PANDAS**

In [None]:
import pandas as pd
# Load f500.csv, using the first CSV column as the row labels (index)
f500 = pd.read_csv('/kaggle/input/part2/f500.csv', index_col = 0)
# Clear the index name
f500.index.name = None
# Inspect the type and dimensions of the loaded object
# (type(f500), not type(500), which would only report the int literal's type)
print(type(f500))
print(f500.shape)

In [None]:
# Show the first three rows
f500.head(3)

In [None]:
# Show the last three rows
f500.tail(3)

In [None]:
# Data type of each column
f500.dtypes

In [None]:
# Print a concise summary of the DataFrame
f500.info()

In [None]:
# Label-based selection syntax (placeholders, not runnable as written):
#   df.loc[row_label, column_label]
#
# Example on f500: the "rank" value for the first row label in the index
f500.loc[f500.index[0], "rank"]

In [None]:
# Selecting the "rank" column; selecting a single column label returns a Series
rank_col = f500["rank"]
rank_col

In [None]:
# Series data exploration methods.
# `Series` in the method names stands for any pandas Series object; they are
# demonstrated here on a small example so the cell runs on a fresh kernel.
demo_series = pd.Series([3, 1, 4, 1, 5])

demo_series.max()      # Series.max()
demo_series.min()      # Series.min()
demo_series.mean()     # Series.mean()
demo_series.median()   # Series.median()
demo_series.mode()     # Series.mode()
demo_series.sum()      # Series.sum()

# Only the last expression of a cell is displayed
demo_series.describe()  # Series.describe()


In [None]:
# Summary statistics for the "assets" column
assets = f500["assets"]
assets.describe()

In [None]:
# Select the "country" column as a Series
# NOTE(review): despite its name, this variable holds the raw column values, not counts.
countries_counts = f500["country"]
countries_counts

In [None]:
# Show the first five rows
f500.head(5)

In [None]:
# 'Method chaining' is the process of combining multiple method calls together in
# a single expression
countries_counts = f500["country"].value_counts() 
countries_counts #Unique value counts of the column "country"

In [None]:
# Chain .loc onto value_counts() to read the count stored under the label "China"
Locating_China = f500["country"].value_counts().loc["China"]
Locating_China 

In [None]:
# Maximum value of each column, restricted to numeric columns
f500.max(numeric_only = True)

In [None]:
# Summarise only the object (string) columns.
# The dtype code is the capital letter 'O' (object), not the digit zero;
# '0' is not a valid dtype and makes describe() raise.
describe = f500.describe(include=['O']) #To get just the object columns
describe

In [None]:
# Select two columns with a list of labels, then keep the first five rows
top5_rank_revenue = f500[["rank", "revenues"]].head()
top5_rank_revenue

In [None]:
# Assigning 0 to every row of the 'revenues' column.
# NOTE: this overwrites the loaded revenue values in f500 for all later cells.
f500["revenues"] = 0
# Kept as a separate statement so top5_rank_revenue stays a DataFrame that
# shows the effect of the assignment, rather than being bound to the scalar 0.
top5_rank_revenue = f500[["rank", "revenues"]].head()
top5_rank_revenue

In [None]:
# Re-select the two columns from f500 and keep the first five rows
top5_rank_revenue = f500[["rank", "revenues"]].head()
top5_rank_revenue

In [None]:
#Creating new columns
# .assign returns a new DataFrame with the extra column, so nothing is written
# into a slice of another frame (which can trigger SettingWithCopyWarning) and
# re-running the cell gives the same result.
top5_rank_revenue = top5_rank_revenue.assign(year_founded=0) #year_founded is a new column
top5_rank_revenue

In [None]:
# Position-based selection: all rows, column at integer position 0
first_column = f500.iloc[:,0]
first_column

In [None]:
# Rows two through six sit at integer positions 1 to 5. The slice end is
# exclusive, so the stop value must be 6.
second_to_sixth_rows = f500.iloc[1:6]
second_to_sixth_rows

In [None]:
#Python comparison operators (each returns a Boolean value) - ==, <, >

In [None]:
# Boolean Series: True where "revenue_change" is null, False otherwise
rev_is_null = f500["revenue_change"].isnull()
rev_is_null.head()

**DATA CLEANING BASICS**

In [None]:
#Reading data into pandas, decoding the file with the Latin-1 encoding
import pandas as pd
laptops = pd.read_csv("/kaggle/input/part2/laptops.csv", encoding = "Latin-1")
laptops

#Computers, at their lowest level, can only understand binary: 0 and 1.
#Encodings are systems for representing characters in binary.

In [None]:
# Print a concise summary of the DataFrame
laptops.info()

In [None]:
#The first step in cleaning data is to check the column labels for stray whitespace

#Let's check the columns

laptops.columns

In [None]:
#Removing any possible white space from the column labels.
#str.strip() returns a new Index rather than modifying the labels in place,
#so the result has to be assigned back for the whitespace to be removed.

laptops.columns = laptops.columns.str.strip()
laptops.columns


In [None]:
# Show the first five rows
laptops.head(5)

Other cleaning tasks that can be done on the column labels are
* Replacing spaces with underscores
* Removing special characters
* Making all labels lowercase
* Shortening any long column names

In [None]:
#A function that uses python string methods to clean column labels, using a 
# loop to apply that function to each label

def clean_col(col):
    """Return a cleaned-up version of a single column label.

    Steps, in order: trim leading/trailing whitespace, remove every "("
    and ")" character, then lowercase the result.
    """
    return col.strip().replace("(", "").replace(")", "").lower()

# Clean every existing label, then replace the DataFrame's labels in one step
new_columns = [clean_col(label) for label in laptops.columns]

laptops.columns = new_columns
laptops.columns

In [None]:
# Show the first rows with the cleaned column labels
laptops.head()

CONVERTING STRING COLUMNS TO NUMERIC

In [None]:
# First five rows of the columns at integer positions 2, 3 and 4
laptops.iloc[:5,2:5]

**Whenever we convert text to numeric data**, we can follow this data cleaning workflow.
* Explore the data in the column
* Identify patterns and special cases
* Remove non-digit characters
* Rename column if required
* Convert the column to a numeric dtype

In [None]:
# Convert the "screen size" column to float.
# The converted Series is only displayed here; it is not assigned back to laptops.
laptops["screen size"].astype(float) #Changing the data type to float


In [None]:
#Check the current dtype of the "screen size" column
laptops["screen size"].dtype

In [None]:
#Changing the data type.
#astype() returns a new Series, so it is assigned back to the column to make
#the conversion stick; the column is then displayed to confirm the new dtype.
laptops["screen size"] = laptops["screen size"].astype(float)
laptops["screen size"]

In [None]:
#Renaming a column.
#rename() returns a new DataFrame; reassigning it (instead of inplace=True)
#keeps the data lineage explicit, and `columns=` states which axis is renamed.
laptops = laptops.rename(columns={"screen size": "screen_size_inches"})


In [None]:
# Display the DataFrame after the rename
laptops

In [None]:
# CORRECTING BAD VALUES
#
# We use 'Series.map()'.
# It works by using a dictionary: each value in the Series is looked up as a
# key and replaced by the corresponding dictionary value.
# NOTE: values that are not keys of the dictionary become NaN.
#
# e.g.

corrections = {
    "pair": "pear",
    "oranje": "orange",
    "bananna": "banana",
}

# Small example Series so the cell runs on a fresh kernel
s = pd.Series(["pair", "oranje", "bananna"])
s = s.map(corrections)
s

In [None]:
# Count the null values in each column
laptops.isnull().sum()
# We have only one column with null values

There are several ways to handle missing values:
* Remove any rows that have missing values
* Remove any columns that have missing values
* Fill the missing values with some other values
* Leave the missing values as is

The first two options are often used to prepare data for machine learning algorithms. We use 'df.dropna()' for both.

While dropping rows or columns is the easiest approach to dealing with missing values, it is not always the best idea.

It is a good idea to explore the missing values before making a decision. We can use 'series.value_counts' to explore all of the values in the column


In [None]:
# Display the DataFrame before exploring its missing values
laptops

In [None]:
# Count every distinct value in the column.
# dropna=False keeps the NaN bucket, so the missing values being explored are
# counted alongside the real ones instead of being silently left out.
laptops["operating system version"].value_counts(dropna=False)

In [None]:
# Keep only the rows whose "operating system version" is missing
missing_os_version = laptops["operating system version"].isnull()
os_with_null_values = laptops.loc[missing_os_version]


In [None]:
# Count the distinct rows among the laptops with a missing OS version.
# dropna=False is required: DataFrame.value_counts() drops rows containing NaN
# by default, and every row in this subset has a NaN "operating system version",
# so the default call returns an empty result.
# NOTE(review): counting a single column of this subset may be more informative — confirm intent.
os_with_null_values.value_counts(dropna=False)