# Antology of machine learning stepic course pandas examples

## Series

In [115]:
import numpy as np
import pandas as pd
# Restricting number of displaying rows, just for convenience
pd.set_option('max_rows', 8)

### Load data

In [2]:
movie = pd.read_csv('data/movie.csv')
# Take 1 column-series
directors = movie['director_name']

In [5]:
directors

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
              ...        
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

### Some series attributes

These attributes contain name of series, type of its values and index

In [8]:
print(directors.name)
print(directors.dtype)
print(directors.index)

director_name
object
RangeIndex(start=0, stop=4916, step=1)


Size of Series can be found in several ways

In [71]:
print(directors.size)
print(directors.shape)
print(directors.count())

4916
(4916,)
4814


In *values* you can find content of series

In [9]:
directors.values

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Benjamin Roberds', 'Daniel Hsia', 'Jon Gunn'], dtype=object)

Whether a Series has missing values - NA

In [88]:
directors.hasnans

True

### Comparison of dataframe and series methods
Both of them have pretty much shared methods

In [23]:
print(f'Series methods - {len(dir(directors))}\n'
      f'Dataframe methods - {len(dir(movie))}\n'
      f'Number of shared methods - {len(set(dir(directors)).intersection(set(dir(movie))))}')

Series methods - 444
Dataframe methods - 482
Number of shared methods - 383


### Some series methods

In [96]:
# Start of Series
directors.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [97]:
# End of Series
directors.tail(7)

4909     Anthony Vallone
4910        Edward Burns
4911         Scott Smith
4912                 NaN
4913    Benjamin Roberds
4914         Daniel Hsia
4915            Jon Gunn
Name: director_name, dtype: object

In [44]:
# Unique values in Series
directors.unique()

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Scott Smith', 'Benjamin Roberds', 'Daniel Hsia'], dtype=object)

In [46]:
# Number of unique elements
directors.nunique()

2397

In [6]:
# Number of occurences of each element in series
directors.value_counts()

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
                    ..
Ian Iqbal Rashid     1
S.R. Bindler         1
Mike Gabriel         1
Michael McGowan      1
Name: director_name, Length: 2397, dtype: int64

In [84]:
# Frequency of elements (in addition has sort argument and whether to include NA)
directors.value_counts(normalize=True)

Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
                      ...   
Ian Iqbal Rashid    0.000208
S.R. Bindler        0.000208
Mike Gabriel        0.000208
Michael McGowan     0.000208
Name: director_name, Length: 2397, dtype: float64

### Some conversion methods

In [65]:
# Conversion of values to list
directors.tolist()

['James Cameron',
 'Gore Verbinski',
 'Sam Mendes',
 'Christopher Nolan',
 'Doug Walker',
 'Andrew Stanton',
 'Sam Raimi',
 'Nathan Greno',
 'Joss Whedon',
 'David Yates',
 'Zack Snyder',
 'Bryan Singer',
 'Marc Forster',
 'Gore Verbinski',
 'Gore Verbinski',
 'Zack Snyder',
 'Andrew Adamson',
 'Joss Whedon',
 'Rob Marshall',
 'Barry Sonnenfeld',
 'Peter Jackson',
 'Marc Webb',
 'Ridley Scott',
 'Peter Jackson',
 'Chris Weitz',
 'Peter Jackson',
 'James Cameron',
 'Anthony Russo',
 'Peter Berg',
 'Colin Trevorrow',
 'Sam Mendes',
 'Sam Raimi',
 'Shane Black',
 'Tim Burton',
 'Brett Ratner',
 'Dan Scanlon',
 'Michael Bay',
 'Michael Bay',
 'Sam Raimi',
 'Marc Webb',
 'Joseph Kosinski',
 'John Lasseter',
 'Martin Campbell',
 'Lee Unkrich',
 'McG',
 'James Wan',
 'Marc Forster',
 'Bryan Singer',
 'J.J. Abrams',
 'Bryan Singer',
 'Baz Luhrmann',
 'Mike Newell',
 'Guillermo del Toro',
 'Michael Bay',
 'Steven Spielberg',
 'Peter Sohn',
 'Mark Andrews',
 'Justin Lin',
 'Andrew Stanton',
 'Br

In [66]:
# Conversion to dictionary index: value
directors.to_dict()

{0: 'James Cameron',
 1: 'Gore Verbinski',
 2: 'Sam Mendes',
 3: 'Christopher Nolan',
 4: 'Doug Walker',
 5: 'Andrew Stanton',
 6: 'Sam Raimi',
 7: 'Nathan Greno',
 8: 'Joss Whedon',
 9: 'David Yates',
 10: 'Zack Snyder',
 11: 'Bryan Singer',
 12: 'Marc Forster',
 13: 'Gore Verbinski',
 14: 'Gore Verbinski',
 15: 'Zack Snyder',
 16: 'Andrew Adamson',
 17: 'Joss Whedon',
 18: 'Rob Marshall',
 19: 'Barry Sonnenfeld',
 20: 'Peter Jackson',
 21: 'Marc Webb',
 22: 'Ridley Scott',
 23: 'Peter Jackson',
 24: 'Chris Weitz',
 25: 'Peter Jackson',
 26: 'James Cameron',
 27: 'Anthony Russo',
 28: 'Peter Berg',
 29: 'Colin Trevorrow',
 30: 'Sam Mendes',
 31: 'Sam Raimi',
 32: 'Shane Black',
 33: 'Tim Burton',
 34: 'Brett Ratner',
 35: 'Dan Scanlon',
 36: 'Michael Bay',
 37: 'Michael Bay',
 38: 'Sam Raimi',
 39: 'Marc Webb',
 40: 'Joseph Kosinski',
 41: 'John Lasseter',
 42: 'Martin Campbell',
 43: 'Lee Unkrich',
 44: 'McG',
 45: 'James Wan',
 46: 'Marc Forster',
 47: 'Bryan Singer',
 48: 'J.J. Abr

In [7]:
# Conversion of Series to DataFrame
directors.to_frame()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
...,...
4912,
4913,Benjamin Roberds
4914,Daniel Hsia
4915,Jon Gunn


### Writing to csv

In [71]:
directors.to_csv('filename', index=False)

In [50]:
list(filter(lambda x: not x.startswith('_'), dir(pd.Series.str)))

['capitalize',
 'cat',
 'center',
 'contains',
 'count',
 'decode',
 'encode',
 'endswith',
 'extract',
 'extractall',
 'find',
 'findall',
 'get',
 'get_dummies',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'islower',
 'isnumeric',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'len',
 'ljust',
 'lower',
 'lstrip',
 'match',
 'normalize',
 'pad',
 'partition',
 'repeat',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'slice',
 'slice_replace',
 'split',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'wrap',
 'zfill']

In [114]:
directors.value_counts()[:'Clint Eastwood':2]

Steven Spielberg    26
Martin Scorsese     20
Name: director_name, dtype: int64