## Step 1: Set Up Imports

In [181]:
import pandas as pd
import plotly.express as px
import numpy as np

## Step 1b: Import the data (issue with encoder)

In [182]:
#data had an encoding issue. Byte was not compatible with UTF-8
#cp1252 was compatible for decoding 0x96 byte

df = pd.read_csv('/Users/kali/Downloads/Taylor_Train.csv', encoding='cp1252')
df.head()

Unnamed: 0,City,Country,Venue,Opening act(s),Attendance (tickets sold / available),Revenue,Tour
0,Evansville,United States,Roberts Municipal Stadium,Gloriana\r\nKellie Pickler,"7,463 / 7,463","$360,617",Fearless_Tour
1,Jonesboro,United States,Convocation Center,Gloriana\r\nKellie Pickler,"7,822 / 7,822","$340,328",Fearless_Tour
2,St. Louis,United States,Scottrade Center,Gloriana\r\nKellie Pickler,"13,764 / 13,764","$650,420",Fearless_Tour
3,Alexandria,United States,Bishop Ireton High School,Gloriana\r\nKellie Pickler,ï¿½,ï¿½,Fearless_Tour
4,North Charleston,United States,North Charleston Coliseum,Gloriana\r\nKellie Pickler,"8,751 / 8,751","$398,154",Fearless_Tour


## Step 2: Tidy Data

In [183]:
#Pep 8 compliance: Column headers be snake cased. 

df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
df.columns

Index(['city', 'country', 'venue', 'opening_act(s)',
       'attendance_(tickets_sold_/_available)', 'revenue', 'tour'],
      dtype='object')

## Step 3:EDA(Exploratory Data Analysis)

To gather summary statistics about our dataset we can use the following methods to identify if any further tidying is necessary.

* df.describe()

* df.info()

* df.shape

* df.value_counts()

In [184]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 7 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   city                                   445 non-null    object
 1   country                                445 non-null    object
 2   venue                                  445 non-null    object
 3   opening_act(s)                         444 non-null    object
 4   attendance_(tickets_sold_/_available)  442 non-null    object
 5   revenue                                442 non-null    object
 6   tour                                   445 non-null    object
dtypes: object(7)
memory usage: 24.5+ KB


Clean revenue column name

In [185]:
df['revenue'] = df['revenue'].str.strip().str.replace('-', '').str.replace('$', '').str.replace(',', '').astype(float, copy=None, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 7 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   city                                   445 non-null    object
 1   country                                445 non-null    object
 2   venue                                  445 non-null    object
 3   opening_act(s)                         444 non-null    object
 4   attendance_(tickets_sold_/_available)  442 non-null    object
 5   revenue                                442 non-null    object
 6   tour                                   445 non-null    object
dtypes: object(7)
memory usage: 24.5+ KB


change revenue to float data type

In [186]:
#df["revenue"] = df["revenue"].astype(float, errors= 'coerce')

df["revenue"] = pd.to_numeric(df["revenue"], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 7 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   city                                   445 non-null    object 
 1   country                                445 non-null    object 
 2   venue                                  445 non-null    object 
 3   opening_act(s)                         444 non-null    object 
 4   attendance_(tickets_sold_/_available)  442 non-null    object 
 5   revenue                                406 non-null    float64
 6   tour                                   445 non-null    object 
dtypes: float64(1), object(6)
memory usage: 24.5+ KB


fill in revenue null values

In [187]:
df["revenue"].fillna(df["revenue"].mean(), inplace=True)

check revenue filled in Null Values

In [188]:
df.isna().sum()

city                                     0
country                                  0
venue                                    0
opening_act(s)                           1
attendance_(tickets_sold_/_available)    3
revenue                                  0
tour                                     0
dtype: int64

cleaning column name for attendance_etc,

In [189]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 7 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   city                                   445 non-null    object 
 1   country                                445 non-null    object 
 2   venue                                  445 non-null    object 
 3   opening_act(s)                         444 non-null    object 
 4   attendance_(tickets_sold_/_available)  442 non-null    object 
 5   revenue                                445 non-null    float64
 6   tour                                   445 non-null    object 
dtypes: float64(1), object(6)
memory usage: 24.5+ KB


First have to create two new columns to get rid of the slash

In [190]:
df['tick_sold'] = df['attendance_(tickets_sold_/_available)'].str.split(" / ").str[0]
df['tick_avail'] = df['attendance_(tickets_sold_/_available)'].str.split(" / ").str[1]

In [None]:
df["'df_tick_sold'"] = pd.to_numeric(df["'df_tick_sold'"], errors='coerce')
df.info()

dropping the attendance_etc column

In [191]:
df.drop(columns='attendance_(tickets_sold_/_available)', inplace=True)
df.head()

Unnamed: 0,city,country,venue,opening_act(s),revenue,tour,tick_sold,tick_avail
0,Evansville,United States,Roberts Municipal Stadium,Gloriana\r\nKellie Pickler,360617.0,Fearless_Tour,7463,7463.0
1,Jonesboro,United States,Convocation Center,Gloriana\r\nKellie Pickler,340328.0,Fearless_Tour,7822,7822.0
2,St. Louis,United States,Scottrade Center,Gloriana\r\nKellie Pickler,650420.0,Fearless_Tour,13764,13764.0
3,Alexandria,United States,Bishop Ireton High School,Gloriana\r\nKellie Pickler,3892357.0,Fearless_Tour,ï¿½,
4,North Charleston,United States,North Charleston Coliseum,Gloriana\r\nKellie Pickler,398154.0,Fearless_Tour,8751,8751.0


In [194]:
df['tick_sold'].fillna(df['tick_sold'].mean(), inplace=True)
df.info()

TypeError: can only concatenate str (not "int") to str

In [None]:
df["'df_tick_sold'"] = pd.to_numeric(df["'df_tick_sold'"], errors='coerce')
df.info()

KeyError: "'df_tick_sold'"

In [None]:
df.isna().sum()

city                                       0
country                                    0
venue                                      0
opening_act(s)                             1
attendance_(tickets_sold_/_available)    445
revenue                                    0
tour                                       0
dtype: int64

In [None]:
px.histogram(df, 'revenue')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 7 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   city                                   445 non-null    object 
 1   country                                445 non-null    object 
 2   venue                                  445 non-null    object 
 3   opening_act(s)                         444 non-null    object 
 4   attendance_(tickets_sold_/_available)  0 non-null      float64
 5   revenue                                445 non-null    float64
 6   tour                                   445 non-null    object 
dtypes: float64(2), object(5)
memory usage: 24.5+ KB
