# Introduction to Pandas and Series Data

## The `Series` Datastructure

- A `Series` is like a cross between a list and a dictionary.

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Passing a plain list of values to pd.Series builds a Series from it.
# pandas automatically assigns an integer index that starts at zero and
# leaves the name of the Series set to **None**.

students = ["Alice", "Jack", "Molly"]

# pandas identifies the type of data held in the Series and sets the dtype
# to match ("object" for these strings). The values are labelled with
# integers, starting at zero.
pd.Series(data=students)

0    Alice
1     Jack
2    Molly
dtype: object

In [4]:
# Under the hood, pandas keeps the values of a Series in a typed array
# provided by the NumPy library. This offers a ***significant speedup*** over
# traditional Python lists when processing data.

numbers = [1, 2, 3]
# Convert the list of integers into a Series
pd.Series(data=numbers)



0    1
1    2
2    3
dtype: int64

In [5]:
# How NumPy, and therefore pandas, handles missing data.
# Python has the **None type** to indicate a **lack of data**. But what do we
# do when we want a typed list, as we have in the Series object?
#
# Underneath, pandas does some type conversion. If we create a list of strings
# in which one element is None, pandas keeps that element as None and uses the
# object dtype for the underlying array.

students = ["Alice", "Jack", None]
# Although the last element is None, pandas still assigns the Series the
# object dtype.
pd.Series(data=students)

0    Alice
1     Jack
2     None
dtype: object

In [6]:
# For numbers, None is converted into NaN (a special floating point value).

numbers = [1, 2, None]

# pandas sets the dtype of this Series to floating point numbers instead of
# object or int. Underneath, pandas represents NaN as a floating point number,
# and because integers can be typecast to floats, pandas converts our integers
# to floats.
#
# So when a Series built from integers comes back with a float dtype, that is
# a sign of **missing data**.
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
# None and NaN may both be used by a data scientist to denote missing data,
# but underneath they are not represented by pandas in the same way.
#
# NaN is *not* equivalent to None: an equality test between the two is False.

# numpy (imported as np in the import cell at the top of the notebook) exposes
# the NaN value as np.nan.
np.nan == None  # False, because NaN is not None (== is used on purpose for this demo)


False

In [8]:
# NaN cannot be tested for equality against itself: comparing NaN with NaN
# using == gives False.

np.nan == np.nan

# A key characteristic of NaN is that it does not compare equal to any value, itself included.

False

In [9]:
# Instead, use a dedicated function to test for the presence of "not a number",
# such as NumPy's np.isnan(value).

np.isnan(np.nan)  # True: isnan tests for NaN directly instead of relying on ==

True

In [10]:
# A Series can be built directly from a dictionary. The index is then taken
# from the keys of the dictionary you provide, rather than being a run of
# incrementing integers.

# NOTE: despite its name, this dict maps each student to a subject, not a score.
students_scores = {
    "Alice": "Physics",
    "Jack": "Chemistry",
    "Molly": "English",
}

s = pd.Series(data=students_scores)
s.index   # the index (the dict keys); not echoed, because only a cell's last expression is displayed
s.values  # the values (the dict values); this is what the cell displays

array(['Physics', 'Chemistry', 'English'], dtype=object)

In [26]:
# The object dtype is not only for strings: it can hold arbitrary Python
# objects, for example tuples.

students = [("Alice", "Brown"), ("Jack", "White"), ("Molly", "Green")]
s = pd.Series(students)  # each tuple is stored whole as one element; the dtype is object

# To find Jack: s.values[1] is the second tuple, and [0] is its first field.
s.values[1][0]


'Jack'

In [27]:
# You can supply your own index by passing it explicitly, as a list, to the
# Series constructor alongside the data.

s = pd.Series(
    data=["Physics", "Chemistry", "English"],  # the values
    index=["Alice", "Jack", "Molly"],          # the index labels (keys)
)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [32]:
# If the index you pass does not line up with the keys of the dictionary,
# pandas favours the index values you provided over the automatic ones.
# Dictionary keys that are not in your index are ignored, and every index
# value that is not a dictionary key gets a missing value (None or NaN).

students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}

# Ask for an index of three students that leaves Jack out and adds Sam.
s = pd.Series(students_scores, index=['Alice', 'Molly', 'Sam'])
print(s)  # Sam is NaN because the dict has no "Sam" key; Jack is dropped because he is not in the index

# Listing Jack in the index brings his entry back, appended after Sam.
pd.Series(students_scores, index=['Alice', 'Molly', 'Sam', 'Jack'])

Alice    Physics
Molly    English
Sam          NaN
dtype: object


Alice      Physics
Molly      English
Sam            NaN
Jack     Chemistry
dtype: object