## Pyspark Machine Learning on Credit Card

This is a classification problem.

In [1]:
from pyspark import SparkContext 
from pyspark import SQLContext
import sys
import os

In [2]:
sc = SparkContext(appName ='ysh_PySparkML', master = 'local')
sc

In [4]:
## build the sqlContext 
sqlContext = SQLContext(sc)
sqlContext

<pyspark.sql.context.SQLContext at 0x112ceae50>

In [5]:
# import pandas as pd
# UniversalBank = pd.read_csv('/Users/tjmask/Desktop/Semester2/Spark/PySpark/Datasets/UniversalBank.csv')

### load data and choose features

In [6]:
## load data
path = 'file:///Users/tjmask/Desktop/Semester2/Spark/PySpark/Datasets/UniversalBank.csv'
UniversalBank = sqlContext.read.csv(path, header=True, inferSchema=True)
UniversalBank.show(5)

+---+---+----------+------+--------+------+-----+---------+--------+-------------+------------------+----------+------+----------+
| ID|Age|Experience|Income|ZIP Code|Family|CCAvg|Education|Mortgage|Personal Loan|Securities Account|CD Account|Online|CreditCard|
+---+---+----------+------+--------+------+-----+---------+--------+-------------+------------------+----------+------+----------+
|  1| 25|         1|    49|   91107|     4|  1.6|        1|       0|            0|                 1|         0|     0|         0|
|  2| 45|        19|    34|   90089|     3|  1.5|        1|       0|            0|                 1|         0|     0|         0|
|  3| 39|        15|    11|   94720|     1|  1.0|        1|       0|            0|                 0|         0|     0|         0|
|  4| 35|         9|   100|   94112|     1|  2.7|        2|       0|            0|                 0|         0|     0|         0|
|  5| 35|         8|    45|   91330|     4|  1.0|        2|       0|            0| 

In [11]:
UniversalBank.columns

['ID',
 'Age',
 'Experience',
 'Income',
 'ZIP Code',
 'Family',
 'CCAvg',
 'Education',
 'Mortgage',
 'Personal Loan',
 'Securities Account',
 'CD Account',
 'Online',
 'CreditCard']

CreditCard is the label, it has two kinds of values, 0: credit card failed, 1: credict card passed. We also see that ID and ZIP code are not helpful for predicting, we will move those. 

In [7]:
# UniversalBank.createOrReplaceTempView("df_Bank")

In [32]:
## select the useful features
selected_features = [c for c in UniversalBank.columns if c not in ['ZIP Code', 'ID']]
selected_features

['Age',
 'Experience',
 'Income',
 'Family',
 'CCAvg',
 'Education',
 'Mortgage',
 'Personal Loan',
 'Securities Account',
 'CD Account',
 'Online',
 'CreditCard']

Above is our selected feautures used for predictions.

In [33]:
## get the trimmed dataset
UniversalBank_trimmed = UniversalBank.select(selected_features)
UniversalBank_trimmed.show(5)

+---+----------+------+------+-----+---------+--------+-------------+------------------+----------+------+----------+
|Age|Experience|Income|Family|CCAvg|Education|Mortgage|Personal Loan|Securities Account|CD Account|Online|CreditCard|
+---+----------+------+------+-----+---------+--------+-------------+------------------+----------+------+----------+
| 25|         1|    49|     4|  1.6|        1|       0|            0|                 1|         0|     0|         0|
| 45|        19|    34|     3|  1.5|        1|       0|            0|                 1|         0|     0|         0|
| 39|        15|    11|     1|  1.0|        1|       0|            0|                 0|         0|     0|         0|
| 35|         9|   100|     1|  2.7|        2|       0|            0|                 0|         0|     0|         0|
| 35|         8|    45|     4|  1.0|        2|       0|            0|                 0|         0|     0|         1|
+---+----------+------+------+-----+---------+--------+-

### descriptive statistics

In [34]:
import pyspark.mllib.stat as st
import numpy as np

numeric_rdd = UniversalBank_trimmed.rdd.map(lambda row: [e for e in row])

mllib_stats = st.Statistics.colStats(numeric_rdd)

mllib_stats

for col, m, v in zip(selected_features, 
                     mllib_stats.mean(), 
                     mllib_stats.variance()):
    print('{0}: \t{1:.2f} \t {2:.2f}'.format(col, m, np.sqrt(v)))

Age: 	45.34 	 11.46
Experience: 	20.10 	 11.47
Income: 	73.77 	 46.03
Family: 	2.40 	 1.15
CCAvg: 	1.94 	 1.75
Education: 	1.88 	 0.84
Mortgage: 	56.50 	 101.71
Personal Loan: 	0.10 	 0.29
Securities Account: 	0.10 	 0.31
CD Account: 	0.06 	 0.24
Online: 	0.60 	 0.49
CreditCard: 	0.29 	 0.46


In [35]:
UniversalBank_trimmed.describe().show()

+-------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+
|summary|               Age|       Experience|           Income|            Family|             CCAvg|        Education|          Mortgage|      Personal Loan| Securities Account|         CD Account|            Online|        CreditCard|
+-------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+
|  count|              5000|             5000|             5000|              5000|              5000|             5000|              5000|               5000|               5000|               5000|              5000|              5000|
|   mean|           45.3384|          20.1046|  

### correlations between features

In [49]:
corrs = st.Statistics.corr(numeric_rdd)

for i, el in enumerate(corrs > 0.5):
    correlated = [
        (selected_features[j], corrs[i][j]) 
        for j, e in enumerate(el) 
        if e == 1.0 and j != i]
    
    if len(correlated) > 0:
        for e in correlated:
            print('{0}-to-{1}: {2:.2f}' \
                  .format(selected_features[i], e[0], e[1]))

Age-to-Experience: 0.99
Experience-to-Age: 0.99
Income-to-CCAvg: 0.65
Income-to-Personal Loan: 0.50
CCAvg-to-Income: 0.65
Personal Loan-to-Income: 0.50


We can drop most of highly correlated features.

### save data, reread, transform use LabeledPoint

In [82]:
import pyspark.mllib.feature as ft
import pyspark.mllib.regression as reg
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

In [107]:
# UniversalBank_trimmed.rdd.map(lambda arr:[arr[c] for c in range(12)]).\
#             saveAsTextFile('/Users/tjmask/Desktop/Semester2/Spark/PySpark/Datasets/UniversalBank_trimmed', compressionCodecClass=int)
    
# points = sc.textFile('file:///Users/tjmask/Desktop/Semester2/Spark/PySpark/Datasets/UniversalBank_trimmed')
# points.map(lambda line: int(line))
# points.take(2)

In [108]:
## transform to RDD
points = UniversalBank_trimmed.rdd.map(lambda arr:[arr[c] for c in range(12)])

In [113]:
def parse_point(line):
#     values = [s for s in line.strip().split(',')]
    return LabeledPoint(line[11], line[0:11])

In [114]:
data_parsed = points.map(parse_point)
data_parsed.take(5)

[LabeledPoint(0.0, [25.0,1.0,49.0,4.0,1.6,1.0,0.0,0.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [45.0,19.0,34.0,3.0,1.5,1.0,0.0,0.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [39.0,15.0,11.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0]),
 LabeledPoint(0.0, [35.0,9.0,100.0,1.0,2.7,2.0,0.0,0.0,0.0,0.0,0.0]),
 LabeledPoint(1.0, [35.0,8.0,45.0,4.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0])]

In [116]:
train, test = data_parsed.randomSplit([0.7, 0.3])

### logit model

In [117]:
from pyspark.mllib.classification \
    import LogisticRegressionWithLBFGS

LR_Model = LogisticRegressionWithLBFGS \
    .train(train, iterations=10)

In [118]:
LR_results = (
        test.map(lambda row: row.label) \
        .zip(LR_Model.predict(test.map(lambda row: row.features)))
    ).map(lambda row: (row[0], row[1] * 1.0))

In [120]:
LR_results.take(5)

[(0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)]

In [121]:
import pyspark.mllib.evaluation as ev
LR_evaluation = ev.BinaryClassificationMetrics(LR_results)

print('Area under PR: {0:.2f}' \
      .format(LR_evaluation.areaUnderPR))
print('Area under ROC: {0:.2f}' \
      .format(LR_evaluation.areaUnderROC))
LR_evaluation.unpersist()

Area under PR: 0.13
Area under ROC: 0.72


In [136]:
Accuracy = LR_results.filter(lambda x: x[0]==x[1]).count()/(LR_results.count())
print('Accuracy: ', Accuracy)

Accuracy:  0.7491408934707904


### random forest with original features

In [146]:
from pyspark.mllib.tree import RandomForest

RF_model = RandomForest \
    .trainClassifier(data=train, 
                     numClasses=2, 
                     categoricalFeaturesInfo={}, 
                     numTrees=6,  
                     featureSubsetStrategy='all',
                     seed=666)

In [147]:
RF_results = (
        test.map(lambda row: row.label) \
        .zip(RF_model \
             .predict(test \
                      .map(lambda row: row.features)))
    )

RF_evaluation = ev.BinaryClassificationMetrics(RF_results)

print('Area under PR: {0:.2f}' \
      .format(RF_evaluation.areaUnderPR))
print('Area under ROC: {0:.2f}' \
      .format(RF_evaluation.areaUnderROC))
RF_evaluation.unpersist()

Area under PR: 0.15
Area under ROC: 0.81


In [148]:
Accuracy = RF_results.filter(lambda x: x[0]==x[1]).count()/(RF_results.count())
print('Accuracy: ', Accuracy)

Accuracy:  0.7450171821305842


### Selecting only the most predictable features¶

In [137]:
selector = ft.ChiSqSelector(4).fit(train)

topFeatures_train = (
        train.map(lambda row: row.label) \
        .zip(selector \
             .transform(train \
                        .map(lambda row: row.features)))
    ).map(lambda row: reg.LabeledPoint(row[0], row[1]))

topFeatures_test = (
        test.map(lambda row: row.label) \
        .zip(selector \
             .transform(test \
                        .map(lambda row: row.features)))
    ).map(lambda row: reg.LabeledPoint(row[0], row[1]))

In [141]:
topFeatures_test.take(1)

[LabeledPoint(0.0, [37.0,13.0,0.0,0.0])]

### random forest with reduced features

In [159]:
from pyspark.mllib.tree import RandomForest

RF_model_2 = RandomForest \
    .trainClassifier(data=topFeatures_train, 
                     numClasses=2, 
                     categoricalFeaturesInfo={}, 
                     numTrees=6,  
                     featureSubsetStrategy='all',
                     seed=666)

In [160]:
RF_results_2 = (
        topFeatures_test.map(lambda row: row.label) \
        .zip(RF_model_2 \
             .predict(topFeatures_test \
                      .map(lambda row: row.features)))
    )

RF_evaluation_2 = ev.BinaryClassificationMetrics(RF_results_2)

print('Area under PR: {0:.2f}' \
      .format(RF_evaluation_2.areaUnderPR))
print('Area under ROC: {0:.2f}' \
      .format(RF_evaluation_2.areaUnderROC))
RF_evaluation_2.unpersist()

Area under PR: 0.10
Area under ROC: 0.66


In [161]:
RF_results_2.take(5)

[(1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)]

In [162]:
Accuracy = RF_results_2.filter(lambda x: x[0]==x[1]).count()/(RF_results_2.count())
print('Accuracy: ', Accuracy)

Accuracy:  0.727147766323024


### logit with reduced featrues

In [163]:
LR_Model_2 = LogisticRegressionWithLBFGS \
    .train(topFeatures_train, iterations=10)

LR_results_2 = (
        topFeatures_test.map(lambda row: row.label) \
        .zip(LR_Model_2 \
             .predict(topFeatures_test \
                      .map(lambda row: row.features)))
    ).map(lambda row: (row[0], row[1] * 1.0))

LR_evaluation_2 = ev.BinaryClassificationMetrics(LR_results_2)

print('Area under PR: {0:.2f}' \
      .format(LR_evaluation_2.areaUnderPR))
print('Area under ROC: {0:.2f}' \
      .format(LR_evaluation_2.areaUnderROC))
LR_evaluation_2.unpersist()

Area under PR: 0.12
Area under ROC: 0.71


In [164]:
Accuracy = LR_results_2.filter(lambda x: x[0]==x[1]).count()/(LR_results_2.count())
print('Accuracy: ', Accuracy)

Accuracy:  0.7429553264604811


## get dummies 

In [211]:
import pyspark.sql.functions as F
df = sqlContext.createDataFrame([
    (1, "A", "X1"),
    (2, "B", "X2"),
    (3, "B", "X3"),
    (1, "B", "X3"),
    (2, "C", "X2"),
    (3, "C", "X2"),
    (1, "C", "X1"),
    (1, "B", "X1"),
], ["ID", "TYPE", "CODE"])

df.show()

# types = df.select("TYPE").distinct().rdd.flatMap(lambda x: x).collect()
# codes = df.select("CODE").distinct().rdd.flatMap(lambda x: x).collect()
# types_expr = [F.when(F.col("TYPE") == ty, 1).otherwise(0).alias("e_TYPE_" + ty) for ty in types]
# codes_expr = [F.when(F.col("CODE") == code, 1).otherwise(0).alias("e_CODE_" + code) for code in codes]
# df = df.select("ID", "TYPE", "CODE", *types_expr+codes_expr)
# df.show()

+---+----+----+
| ID|TYPE|CODE|
+---+----+----+
|  1|   A|  X1|
|  2|   B|  X2|
|  3|   B|  X3|
|  1|   B|  X3|
|  2|   C|  X2|
|  3|   C|  X2|
|  1|   C|  X1|
|  1|   B|  X1|
+---+----+----+



In [183]:
types = df.select('TYPE').distinct().rdd.flatMap(lambda x:x).collect()
codes = df.select('CODE').distinct().rdd.flatMap(lambda x:x).collect()
types_dummy = [F.when(F.col('TYPE') == ty, 1).otherwise(0).alias('e_TYPE_'+ty) for ty in types]
codes_dummy = [F.when(F.col('CODE') == code, 1).otherwise(0).alias('e_TYPE_'+code) for code in codes]

In [184]:
df.select(['*']+types_dummy+codes_dummy).show()

+---+----+----+--------+--------+--------+---------+---------+---------+
| ID|TYPE|CODE|e_TYPE_B|e_TYPE_C|e_TYPE_A|e_TYPE_X1|e_TYPE_X3|e_TYPE_X2|
+---+----+----+--------+--------+--------+---------+---------+---------+
|  1|   A|  X1|       0|       0|       1|        1|        0|        0|
|  2|   B|  X2|       1|       0|       0|        0|        0|        1|
|  3|   B|  X3|       1|       0|       0|        0|        1|        0|
|  1|   B|  X3|       1|       0|       0|        0|        1|        0|
|  2|   C|  X2|       0|       1|       0|        0|        0|        1|
|  3|   C|  X2|       0|       1|       0|        0|        0|        1|
|  1|   C|  X1|       0|       1|       0|        1|        0|        0|
|  1|   B|  X1|       1|       0|       0|        1|        0|        0|
+---+----+----+--------+--------+--------+---------+---------+---------+



In [218]:
import pyspark.ml.feature as ft
import pyspark.sql.types as typ

df = df \
    .withColumn(       'TYPE_DUMMY', 
                df['TYPE'] \
                    .cast(typ.IntegerType()))

In [219]:
df.show()

+---+----+----+----------+
| ID|TYPE|CODE|TYPE_DUMMY|
+---+----+----+----------+
|  1|   A|  X1|      null|
|  2|   B|  X2|      null|
|  3|   B|  X3|      null|
|  1|   B|  X3|      null|
|  2|   C|  X2|      null|
|  3|   C|  X2|      null|
|  1|   C|  X1|      null|
|  1|   B|  X1|      null|
+---+----+----+----------+



In [220]:
encoder = ft.OneHotEncoder(
    inputCol='TYPE_DUMMY', 
    outputCol='TYPE_DUMMY_VEC')

In [199]:
encoder = ft.OneHotEncoder(inputCol="TYPE_DUMMY", outputCol="TYPE")

In [222]:
labels = [
    ('ID', typ.IntegerType()),
    ('TYPE', typ.StringType()),
    ('CODE', typ.StringType())]

In [223]:
featuresCreator = ft.VectorAssembler(
    inputCols=[
        col[0] 
        for col 
        in labels[2:]] + \
    [encoder.getOutputCol()], 
    outputCol='features'
)

In [224]:
featuresCreator

VectorAssembler_fbc933d5e95b

### grid search 

In [264]:
import pyspark.ml.tuning as tune
import pyspark.ml.classification as cl
from pyspark.ml.classification import LogisticRegression
import pyspark.ml.evaluation as ev
from pyspark.ml import Pipeline

In [228]:
data_parsed.take(3)

[LabeledPoint(0.0, [25.0,1.0,49.0,4.0,1.6,1.0,0.0,0.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [45.0,19.0,34.0,3.0,1.5,1.0,0.0,0.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [39.0,15.0,11.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0])]

In [233]:
logistic = cl.LogisticRegression(
    labelCol='label')

grid = tune.ParamGridBuilder() \
    .addGrid(logistic.maxIter,  
             [2, 10, 50]) \
    .addGrid(logistic.regParam, 
             [0.01, 0.05, 0.3]) \
    .build()

In [234]:
evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability', 
    labelCol='label')

In [235]:
cv = tune.CrossValidator(
    estimator=logistic, 
    estimatorParamMaps=grid, 
    evaluator=evaluator
)

In [238]:
pipeline = Pipeline(stages=[encoder,featuresCreator])
data_transformer = pipeline.fit(train)

In [241]:
# model = pipeline.fit(train)
# test_model = model.transform(test)

In [303]:
# maxIter = [c for c in range(5,10)]
# regParam = [0.001,0.01,0.1,1]
# for i in maxIter:
#     for j in regParam:
#         logistic = LogisticRegression(maxIter=i, regParam=j) 
        
#         LR = logistic.fit(train.toDF())

#         LR_result = (
#                 topFeatures_test.map(lambda row: row.label) \
#                 .zip(LR \
#                      .predict(test \
#                               .map(lambda row: row.features)))
#             ).map(lambda row: (row[0], row[1] * 1.0))

#         LR_evaluation = ev.BinaryClassificationMetrics(LR_result)

#         print('MaxIter:{0}, reParam: {1}, Area under PR: {2:.2f}' \
#               .format(i,j,LR_evaluation.areaUnderPR))
#         print('MaxIter:{0}, reParam: {1}, Area under ROC: {2:.2f}' \
#               .format(i,j, LR_evaluation.areaUnderROC))
#         LR_evaluation.unpersist()

In [249]:
topFeatures_train.toDF().show(2)

+-------------------+-----+
|           features|label|
+-------------------+-----+
| [25.0,1.0,1.0,0.0]|  0.0|
|[45.0,19.0,1.0,0.0]|  0.0|
+-------------------+-----+
only showing top 2 rows



In [256]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

bdf = sc.parallelize([
Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))]).toDF()

In [302]:
# blor = LogisticRegression(regParam=0.01)
# blorModel = blor.fit()
# blorModel.coefficients

In [287]:
df = train.toDF()

In [285]:
new_schema = ArrayType(DoubleType(), containsNull=False)
x_new = X(lambda x:x, new_schema)
# LDA = LDA.withColumn("topic_vector_fix_dim",udf_foo("topic_vector_fix_dim"))

TypeError: 'PipelinedRDD' object is not callable

In [293]:
# from pyspark.ml.feature import VectorAssembler
# vectorAssembler = VectorAssembler(inputCols = ["features"], outputCol = "features")
# df = vectorAssembler.transform(df)
# df = df.select(['features', 'labels'])

### NLP

In [304]:
text_data = sqlContext.createDataFrame([
    ['''Machine learning can be applied to a wide variety 
        of data types, such as vectors, text, images, and 
        structured data. This API adopts the DataFrame from 
        Spark SQL in order to support a variety of data types.'''],
    ['''DataFrame supports many basic and structured types; 
        see the Spark SQL datatype reference for a list of 
        supported types. In addition to the types listed in 
        the Spark SQL guide, DataFrame can use ML Vector types.'''],
    ['''A DataFrame can be created either implicitly or 
        explicitly from a regular RDD. See the code examples 
        below and the Spark SQL programming guide for examples.'''],
    ['''Columns in a DataFrame are named. The code examples 
        below use names such as "text," "features," and "label."''']
], ['input'])

In [305]:
text_data.show()

+--------------------+
|               input|
+--------------------+
|Machine learning ...|
|DataFrame support...|
|A DataFrame can b...|
|Columns in a Data...|
+--------------------+



In [310]:
tokenizer = ft.RegexTokenizer(
    inputCol='input', 
    outputCol='input_arr', 
    pattern='\s+|[,.\"]')

In [316]:
tok = tokenizer \
    .transform(text_data) \
    .select('input_arr') 

tok.take(2)

[Row(input_arr=['machine', 'learning', 'can', 'be', 'applied', 'to', 'a', 'wide', 'variety', 'of', 'data', 'types', 'such', 'as', 'vectors', 'text', 'images', 'and', 'structured', 'data', 'this', 'api', 'adopts', 'the', 'dataframe', 'from', 'spark', 'sql', 'in', 'order', 'to', 'support', 'a', 'variety', 'of', 'data', 'types']),
 Row(input_arr=['dataframe', 'supports', 'many', 'basic', 'and', 'structured', 'types;', 'see', 'the', 'spark', 'sql', 'datatype', 'reference', 'for', 'a', 'list', 'of', 'supported', 'types', 'in', 'addition', 'to', 'the', 'types', 'listed', 'in', 'the', 'spark', 'sql', 'guide', 'dataframe', 'can', 'use', 'ml', 'vector', 'types'])]

In [317]:
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))
stopwords = ft.StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                outputCol='input_stop')

In [318]:
stopwords.transform(tok).select('input_stop').take(1)

[Row(input_stop=['machine', 'learning', 'applied', 'wide', 'variety', 'data', 'types', 'vectors', 'text', 'images', 'structured', 'data', 'api', 'adopts', 'dataframe', 'spark', 'sql', 'order', 'support', 'variety', 'data', 'types'])]

In [319]:
ngram = ft.NGram(n=2, 
    inputCol=stopwords.getOutputCol(), 
    outputCol="nGrams")

pipeline = Pipeline(stages=[tokenizer, stopwords, ngram])

In [320]:
data_ngram = pipeline \
    .fit(text_data) \
    .transform(text_data)
    
data_ngram.select('nGrams').take(1)

[Row(nGrams=['machine learning', 'learning applied', 'applied wide', 'wide variety', 'variety data', 'data types', 'types vectors', 'vectors text', 'text images', 'images structured', 'structured data', 'data api', 'api adopts', 'adopts dataframe', 'dataframe spark', 'spark sql', 'sql order', 'order support', 'support variety', 'variety data', 'data types'])]

In [324]:
import pandas as pd
import io
import requests

url = 'https://raw.githubusercontent.com/TJmask/learningPySpark/master/Data/departuredelays.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,date,delay,distance,origin,destination
0,1011245,6,602,ABE,ATL
1,1020600,-8,369,ABE,DTW
2,1021245,-2,602,ABE,ATL
3,1020605,-4,602,ABE,ATL
4,1031245,-4,602,ABE,ATL


In [326]:
read_data = requests.get(url).content
address = pd.read_csv(io.StringIO(read_data.decode('utf-8')))
address.head()

Unnamed: 0,date,delay,distance,origin,destination
0,1011245,6,602,ABE,ATL
1,1020600,-8,369,ABE,DTW
2,1021245,-2,602,ABE,ATL
3,1020605,-4,602,ABE,ATL
4,1031245,-4,602,ABE,ATL


In [338]:
path2 = 'https://raw.githubusercontent.com/TJmask/learningPySpark/master/Data/airport-codes-na.txt'
df2 =  pd.read_csv(path2, sep="\t")##delimiter="\t", header=None
df2.head()

Unnamed: 0,City,State,Country,IATA
0,Abbotsford,BC,Canada,YXX
1,Aberdeen,SD,USA,ABR
2,Abilene,TX,USA,ABI
3,Akron,OH,USA,CAK
4,Alamosa,CO,USA,ALS


In [339]:
# df.to_csv('/Users/tjmask/Desktop/Semester2/Spark/PySpark/Datasets/departuredelays.csv')
# df2.to_csv('/Users/tjmask/Desktop/Semester2/Spark/PySpark/Datasets/airport-codes-na.csv')