# Extracting Dates from Pandas

In [None]:
# Parse the release dates once and reuse the result; `.dt.year` pulls the
# calendar year out of each parsed timestamp.
years = pd.to_datetime(df.ReleaseDate)
df['year'] = years.dt.year

# Getting the total minutes from a Pandas object

In [None]:
# Build the DatetimeIndex once, then combine its hour and minute components
# into a total number of minutes (seconds are ignored).
runtime_index = pd.DatetimeIndex(df.runtime)
df['runtime'] = runtime_index.hour * 60 + runtime_index.minute

# KeyError When plotting dates

This is a particularly insidious error because it is completely unclear what is going on. When using `.plot()`, Pandas only looks for numeric columns. To solve this problem, we set the time variable as the index and then plot with the `o` style so that the points are drawn as unconnected markers.

Another alternative is to recast the date as an `np.int64` and then tell matplotlib (mpl) to interpret those values as dates.

In [None]:
# Use the release date as the index while keeping it available as a column,
# then draw the gross totals as unconnected markers. The trailing semicolon
# suppresses the notebook's echo of the returned axes object.
df = df.set_index('ReleaseDate', drop=False)
df['DomesticTotalGross'].plot(style='o', figsize=(15, 6));

# Set Gotchas

Intersection only works with `s & t` or `s.intersection(t)`, NOT `s and t` (which just returns one of the two sets based on truthiness).

# Checking the Rows that dropped out after merging 

In [None]:
# Rows of df2 whose title does not appear anywhere in the merged frame.
df2.loc[~df2['title'].isin(merged_df['title'])]

#  Plotting Groupby Projects

# Casting timedelta64 as a float

In [None]:
# Dividing a timedelta64 column by a one-day timedelta yields a float number
# of days; the division is vectorized, so no per-element apply is needed.
df['delta'] = df['delta'] / np.timedelta64(1, 'D')

# NumPy arrays initialized with only `int`s silently truncate floats assigned later

In [1]:
import numpy as np

In [12]:
# Inspect the element type NumPy chose for `arr`.
arr.dtype

dtype('int64')

In [5]:
# Every initial value is an int, so the array ends up with an integer dtype
# (the dtype cell reports int64).
arr = np.array([2, 3, 123])
arr

array([  2,   3, 123])

In [10]:
# Assigning a float into the integer array stores 7 rather than 7.9
# (see the array displayed in the next cell).
arr[0] = 7.9

In [11]:
# Display the array after the float assignment.
arr

array([  7,   3, 123])

# Scientific Notation results in floats

In [2]:
# 1e6 is a float literal, so range() rejects it with the TypeError shown
# below; pass an int instead, e.g. 10**6 or int(1e6).
range(1e6)

TypeError: 'float' object cannot be interpreted as an integer

# Finding number of unique items per column

In [None]:
# (column name, number of distinct values) for every column; dropna=False
# keeps missing values in the count, just like len(Series.unique()).
[(column, combined_data[column].nunique(dropna=False))
 for column in combined_data.columns]

# Casting Column with Lists to new Columns

In [None]:
# Build a new frame from the per-row values in `categories`. from_records is
# a classmethod, so call it on the class rather than through the instance.
# NOTE(review): assumes `businesses` is a plain DataFrame and each cell holds
# a list, per the heading — confirm.
pd.DataFrame.from_records(businesses['categories'].tolist())

# Multiply the top n columns together

This was also a challenging move in Pandas. 

In [1]:
def define_success(age, avg_rating, reviews_per_month, top_n=2):
    """
    Defines success as the product of the `top_n` highest values per row.

    The three inputs are combined column-wise into a DataFrame. For each row
    the values are sorted and the `top_n` largest are multiplied together;
    with the default `top_n=2` this is the "highest 2 of the 3" metric over
    Age, Average Rating and Reviews/Month.

    The inputs are expected to be standardized beforehand such that the
    standard deviation is 10 and the mean is 0.

    Parameters:
        age, avg_rating, reviews_per_month: array-likes, one value per row.
        top_n: how many of the highest values to multiply (1 to 3).

    Returns:
        DataFrame with columns `age`, `avg_rating`, `reviews_per_month` and
        `success_metric`.

    Raises:
        ValueError: if `top_n` is not between 1 and the number of inputs.
    """
    df = pd.DataFrame(dict(age=age, avg_rating=avg_rating,
                           reviews_per_month=reviews_per_month))

    n_inputs = df.shape[1]
    if not 1 <= top_n <= n_inputs:
        raise ValueError(
            "top_n must be between 1 and {}, got {}".format(n_inputs, top_n))

    # Ascending sort within each row, so the last `top_n` columns hold the
    # largest values and column 0 of the slice is the smallest of them.
    sorted_vals = np.sort(df.values, axis=1)
    top_vals = sorted_vals[:, -top_n:]
    df['success_metric'] = top_vals.prod(axis=1)

    # Edge case: when the smallest of the selected values is negative, the
    # product is either negative or a misleadingly large positive (an even
    # number of negatives), so the metric is clamped to 0.
    df.loc[top_vals[:, 0] < 0, 'success_metric'] = 0

    return df

# Appending a list to a DataFrame

This is ridiculously hard considering how often it comes up.

In [None]:
# Label the list's values with the frame's column names so they line up
# with the right columns.
row_s = pd.Series(row, index=df.columns)
# Append the labelled Series (not the raw list) as a one-row frame.
# DataFrame.append was removed in pandas 2.0, so use pd.concat instead.
df = pd.concat([df, pd.DataFrame([row_s])], ignore_index=True)

An alternative to the above solution is to use dictionaries: collect each row as a dictionary and build the DataFrame from the list of dictionaries. This is substantially easier.

# Dealing with Dates

In [None]:
# Keep only the rows whose timestamp falls later than ten days before that
# row's failure date.
df = df.loc[df['timestamp'] > df['failure_dates'] - pd.DateOffset(days=10)]

# Plotting Groups

In [None]:
# One separate figure per car: plot that car's torque readings and put the
# car id in the title.
for car_id, car_rows in df.groupby('car_id'):
    plt.figure()
    group_title = "Group {}".format(car_id)
    car_rows['torque'].plot(title=group_title)

# Case-Like Syntax

Say we want to map the failure dates based on the car_id

In [2]:
from pandas import Timestamp

In [3]:
# Lookup table: car_id -> timestamp of that car's failure.
failure_dates = {
    6773: Timestamp('2015-05-19 00:00:00'),
    10982: Timestamp('2015-05-31 00:00:00'),
    17528: Timestamp('2015-05-31 00:00:00'),
    18827: Timestamp('2015-05-31 00:00:00'),
    28902: Timestamp('2015-05-31 00:00:00'),
    30528: Timestamp('2015-05-31 00:00:00'),
    31078: Timestamp('2015-05-31 00:00:00'),
    48835: Timestamp('2015-05-31 00:00:00'),
    96085: Timestamp('2015-05-24 00:00:00'),
    97317: Timestamp('2015-05-31 00:00:00'),
}

In [None]:
# Look up each row's car_id in the failure_dates dict. Assigning by key
# creates the column directly, so no placeholder column is needed first.
# car_ids missing from the dict get a missing value.
df['failure_dates'] = df.car_id.map(failure_dates)