# PySpark Unstructured Data Analysis: 
## Find the most common skills listed in the book of charters
### Tyler Cady 11/21/17


In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Build a local-mode configuration that uses every available core.
conf = SparkConf().setMaster("local[*]")
try:
    sc = SparkContext(conf=conf)
except ValueError:
    # Only one SparkContext may be active at a time; constructing a second
    # one raises ValueError. Reuse the active context so that `sc` is always
    # bound after this cell, even in a fresh namespace.
    print('Spark Context already running')
    sc = SparkContext.getOrCreate()

In [6]:
# Read the charters document into an RDD (one element per line of text),
# then display the resulting object's type.
charters_path = 'Charters.txt'
words = sc.textFile(charters_path)
type(words)

pyspark.rdd.RDD

## Check out the RDD

In [8]:
# Preview the first ten lines of the raw text.
words.take(num=10)

['',
 '',
 '',
 '',
 '',
 '',
 'Fall 2017 MS Internship Program',
 'MSBA Business Analytics',
 'MS Supply Chain Management',
 '']

## Convert everything to lower case

In [9]:
# Normalise case so that e.g. "Python" and "python" are counted together.
wordsLower = words.map(lambda text: text.lower())
# Preview the first ten lower-cased lines.
wordsLower.take(num=10)

['',
 '',
 '',
 '',
 '',
 '',
 'fall 2017 ms internship program',
 'msba business analytics',
 'ms supply chain management',
 '']

## Split into words and flatten to make one-dimensional

In [11]:
import re  # standard-library regular expressions

# Split every line on runs of non-word characters and flatten the per-line
# lists into a single RDD of tokens. The pattern is a raw string so that
# `\W` reaches the regex engine verbatim instead of being treated as an
# (invalid) string escape sequence.
# Blank lines and lines that begin or end with punctuation still yield ''
# tokens here; those are expected at this stage.
word = wordsLower.flatMap(lambda line: re.split(r'\W+', line.strip()))
word.take(10)

['', '', '', '', '', '', 'fall', '2017', 'ms', 'internship']

## Filter out the empty elements

In [12]:
# Keep only non-empty tokens (the split step leaves '' for blank lines).
wordne = word.filter(lambda token: token != '')
# Preview the first ten surviving tokens.
wordne.take(num=10)

['fall',
 '2017',
 'ms',
 'internship',
 'program',
 'msba',
 'business',
 'analytics',
 'ms',
 'supply']

## Perform a word count

In [18]:
# Classic word count:
#   1. pair every token with an initial count of 1
word1 = wordne.map(lambda token: (token, 1))
#   2. add up the counts for each distinct token
wordc = word1.reduceByKey(lambda left, right: left + right)
#   3. flip to (count, token) so the count is the key, then sort descending
count_first = wordc.map(lambda pair: (pair[1], pair[0]))
wordcs = count_first.sortByKey(ascending=False)
#   4. flip back to (token, count) for readability
wordf = wordcs.map(lambda pair: (pair[1], pair[0]))
# Show the 500 most frequent tokens.
wordf.take(500)

[('and', 2075),
 ('the', 2018),
 ('to', 1235),
 ('of', 1231),
 ('project', 764),
 ('will', 711),
 ('data', 698),
 ('in', 618),
 ('a', 544),
 ('student', 535),
 ('for', 483),
 ('with', 458),
 ('be', 428),
 ('is', 417),
 ('expected', 369),
 ('business', 307),
 ('on', 285),
 ('as', 276),
 ('our', 264),
 ('analytics', 251),
 ('s', 251),
 ('this', 244),
 ('team', 242),
 ('that', 190),
 ('date', 182),
 ('are', 178),
 ('management', 170),
 ('work', 165),
 ('supply', 161),
 ('we', 159),
 ('at', 158),
 ('analysis', 158),
 ('development', 157),
 ('manager', 155),
 ('chain', 147),
 ('from', 144),
 ('customer', 138),
 ('an', 132),
 ('compliance', 132),
 ('students', 124),
 ('all', 122),
 ('career', 119),
 ('other', 117),
 ('internship', 114),
 ('by', 114),
 ('have', 107),
 ('leeds', 96),
 ('or', 96),
 ('denver', 94),
 ('charter', 94),
 ('marketing', 94),
 ('responsibilities', 94),
 ('opportunities', 90),
 ('statement', 89),
 ('reporting', 88),
 ('labor', 88),
 ('not', 87),
 ('your', 86),
 ('member

## Keep only the skills we care about from the wordf RDD

In [44]:
# Skills of interest. A set gives a single membership test per record
# instead of a long chain of `or`-ed equality comparisons, and makes the
# list easy to extend.
skills = frozenset([
    "r", "python", "sql", "pig", "hive", "spark", "excel", "database",
    "hadoop", "model", "modeling", "programming", "cluster",
])

# Keep only the (word, count) pairs whose word is one of the skills and
# bring them back to the driver; the order follows wordf.
wordf.filter(lambda pair: pair[0] in skills).collect()




[('model', 69),
 ('modeling', 36),
 ('r', 21),
 ('sql', 20),
 ('python', 19),
 ('database', 16),
 ('hive', 10),
 ('excel', 9),
 ('programming', 7),
 ('cluster', 6),
 ('spark', 6),
 ('hadoop', 4),
 ('pig', 1)]