In [1]:
from pyspark.sql import SparkSession  
spark = SparkSession \
    .builder \
    .appName("genre-classification") \
    .config("master", "local[*]") \
    .config("spark.executor.memory", "5g")\
    .config("spark.driver.memory", "5g")\
    .config("spark.memory.offHeap.enabled",True)\
    .config("spark.memory.offHeap.size","6g")  \
    .getOrCreate()

In [2]:
data = spark.read.csv("wiki_movie_plots_deduped.csv", header=True, inferSchema = True)

In [3]:
data=data.na.drop()
#data=data.filter("genre in ('drama', 'comedy', 'horror', 'action', 'thriller','romance')")
data=data.filter("genre in ('drama', 'comedy','horror')")


In [4]:
total=data.count()


In [5]:
genre_clean=data.groupby('Genre').count().sort('count', ascending=False).withColumnRenamed("Genre", "GenreAgg")
genre_clean.toPandas()

Unnamed: 0,GenreAgg,count
0,drama,5718
1,comedy,4267
2,horror,1092


In [6]:
from pyspark.sql.functions import col
weights=genre_clean.withColumn("ratio", (total/(col("count"))))

In [7]:
data_final=data.join(weights, weights.GenreAgg==data.Genre)
data_final.select('Genre','Plot','Ratio').limit(10).toPandas()

Unnamed: 0,Genre,Plot,Ratio
0,horror,White-haired Dr. Jekyll has secretly locked hi...,10.143773
1,horror,Dr. Henry Jekyll (King Baggot) sends a note to...,10.143773
2,horror,"""Henry Jekyll (John Barrymore) is a doctor of ...",10.143773
3,horror,"The film is a contemporary (1920s, though the ...",10.143773
4,horror,The film opens with the debut of the new seaso...,10.143773
5,horror,Dick Bannister is the new field boss of the Fo...,10.143773
6,horror,"In the Latin Quarter of Paris, sculptor Margar...",10.143773
7,horror,Alonzo the Armless is a circus freak who uses ...,10.143773
8,horror,"Jim and Eve, a young society couple, are kidna...",10.143773
9,horror,Renfield (Dwight Frye) is a solicitor travelin...,10.143773


In [8]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="Plot", outputCol="tokens")
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="tokens", outputCol="features")
indexer = StringIndexer(inputCol="Genre", outputCol="label")
doc2vec_pipeline = Pipeline(stages=[tokenizer,w2v,indexer])

In [9]:
doc2vec_model = doc2vec_pipeline.fit(data_final)
doc2vecs_df = doc2vec_model.transform(data_final)
doc2vecs_df.select('tokens', 'features','label').limit(10).toPandas()

Unnamed: 0,tokens,features,label
0,"[white-haired, dr., jekyll, has, secretly, loc...","[0.04322468663438096, 0.03331412000621745, -0....",2.0
1,"[dr., henry, jekyll, (king, baggot), sends, a,...","[0.04579064731693246, 0.039869951117257836, -0...",2.0
2,"[""henry, jekyll, (john, barrymore), is, a, doc...","[0.04659355730051143, 0.04518813767119131, -0....",2.0
3,"[the, film, is, a, contemporary, (1920s,, thou...","[0.05495891580894271, 0.019276349543360993, -0...",2.0
4,"[the, film, opens, with, the, debut, of, the, ...","[0.02655054842730228, 0.022479477146564907, -0...",2.0
5,"[dick, bannister, is, the, new, field, boss, o...","[0.029137669955570326, 0.029050769960685143, -...",2.0
6,"[in, the, latin, quarter, of, paris,, sculptor...","[0.015465231206059, 0.04238198277105729, -0.03...",2.0
7,"[alonzo, the, armless, is, a, circus, freak, w...","[0.046733187741353245, 0.03459240040813969, -0...",2.0
8,"[jim, and, eve,, a, young, society, couple,, a...","[0.02537259155656102, 0.03235944381594891, -0....",2.0
9,"[renfield, (dwight, frye), is, a, solicitor, t...","[0.044561441464508675, 0.02433506120463, -0.05...",2.0


In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
train, test = doc2vecs_df.randomSplit([0.7, 0.3], seed=12345)
lr_classifier = LogisticRegression(family="multinomial", weightCol="ratio")

lr_classifier_pipeline = Pipeline(stages=[lr_classifier])
lr_predictions = lr_classifier_pipeline.fit(train).transform(test)

lr_model_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")


In [11]:
print("LogisticRegression accuracy {}".format(lr_model_evaluator.evaluate(lr_predictions)))

LogisticRegression accuracy 0.5686576354679803


In [12]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

    #.addGrid(lr_classifier.regParam, [0.1, 0.01, 0.3]) \

paramGrid = ParamGridBuilder() \
    .addGrid(lr_classifier.maxIter, [10, 100, 500, 1000]) \
    .addGrid(lr_classifier.weightCol, ["ratio"]) \
    .build()

crossval = CrossValidator(estimator=lr_classifier,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"),
                          numFolds=4)

In [13]:
cvModel = crossval.fit(train)

prediction = cvModel.transform(test)
selected = prediction.select("prediction",'label')



In [14]:
summary=cvModel.bestModel.summary
print("Logistic Regression accuracy  for best Model {}".format(cvModel.bestModel.summary.accuracy))
print("Logistic Regression f1  for best Model {}".format(cvModel.bestModel.summary.weightedFMeasure()))
print("Logistic Regression precision  for best Model {}".format(cvModel.bestModel.summary.weightedPrecision))
print("Logistic Regression recall  for best Model {}".format(cvModel.bestModel.summary.weightedRecall))


Logistic Regression accuracy  for best Model 0.6151488057223145
Logistic Regression f1  for best Model 0.6188575269485528
Logistic Regression precision  for best Model 0.6402513127782292
Logistic Regression recall  for best Model 0.6151488057223145
