In [1]:
#####Hadoop project deliverable

#Full example 1: the key is guessed by joining tables on a language that must be specified up front; this takes some time
from pyspark.sql.functions import lit
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType
from pyspark.sql import functions as F

#Language the analysed texts are assumed to be written in: it is added as a column to every frequency table and used as the join key against the reference table
LANGUAGE="english"

#########################################################################
#Function which reads the file generated by Hadoop and creates a dataframe with 2 columns: the letter and the number of occurrences of that letter
def readHadoopFileDf(path):
  """Load a Hadoop character-count output file as a two-column DataFrame.

  Every line of the file is split on the tab character; the first field is
  kept as the letter and the second one is converted to an integer count.
  Returns a DataFrame with the columns "letter" and "countLetter".
  """
  def parseLine(line):
    fields = line.split('\t')
    return (fields[0], int(fields[1]))

  letterCounts = sc.textFile(path).map(parseLine)
  return letterCounts.toDF(["letter","countLetter"])

#Load the three Hadoop outputs into dataframes (labelled later in this cell as the original, the modified and the fully modified text)
dfFreq, dfFreqEisH, dfFreqCiphered3 = [
  readHadoopFileDf(hadoopOutputPath)
  for hadoopOutputPath in (
    "/FileStore/tables/4gv5en6u1509544751327/part_r_00000-b65ee",
    "/FileStore/tables/part_r_00003-dc956",
    "/FileStore/tables/part_r_00000_CharFreqFullyCiphered3-29f14",
  )
]

#Function which adds a column to a dataframe, holding the same value on every row
def addFixedColumnToDf(dataframe,columnName,content):
  """Return `dataframe` with a column `columnName` holding `content` on every row."""
  constantColumn = lit(content)
  return dataframe.withColumn(columnName, constantColumn)

#Tag every frequency table with the language of the text, so that it can be joined with the reference table on "language"
dfFreq=addFixedColumnToDf(dfFreq,"language",LANGUAGE)
dfFreqEisH=addFixedColumnToDf(dfFreqEisH,"language",LANGUAGE)
dfCiphered3=addFixedColumnToDf(dfFreqCiphered3,"language",LANGUAGE) #the tagged table gets its own name; dfFreqCiphered3 stays untagged

#Display the three tagged tables
dfFreq.show()
dfFreqEisH.show()
dfCiphered3.show() #show the tagged table, like the two above (dfFreqCiphered3 has no "language" column)

#This function generates a dataframe containing 2 columns: the language and the most used letter in that language
def generateMostFreqLetterByCountryDF():
  """Build the reference table giving the most frequent letter of each language.

  Returns a DataFrame with the columns "language" and "mostFrequentLetter".
  """
  #Not named `set`, which would shadow the builtin; add a tuple here to support another language
  mostFrequentLetterByLanguage = [('english','e'),('french','e'),('spanish','e'),('dutch','e'),('swedish','e'),('danish','e'),('portuguese','a'),('italian','a')]
  #The tuples already have the (language, letter) shape, so no intermediate map is needed
  return sc.parallelize(mostFrequentLetterByLanguage).toDF(["language","mostFrequentLetter"])

dfLang=generateMostFreqLetterByCountryDF() #Reference table: the most used letter of each selected language (the list can be extended)
dfLang.show(10) #display at most 10 rows of the reference table


#######################################################################First method: slow because of the joins; the guess function of example 2 is the optimised version
#For each language we keep the row(s) holding the maximum count and attach the letter that is normally the most frequent in that language
#Allows Spark SQL to run joins that it plans as cartesian products. NOTE(review): the joins below are keyed on column names, so this setting may not be needed - confirm
spark.conf.set("spark.sql.crossJoin.enabled", "true")
def joinLangAndOutputDF(dfMR,dfTableLang):
  """Attach the reference letter of each language to its most counted row(s).

  dfMR must provide the columns "language" and "countLetter"; dfTableLang must
  provide the column "language". The rows of dfMR whose count equals the
  maximum count of their language are kept and joined with dfTableLang on
  "language".
  """
  maxCountPerLanguage = dfMR.groupby("language").agg(F.max("countLetter"))
  #Rename the aggregate column back to "countLetter" so it can serve as a join key
  maxCountPerLanguage = maxCountPerLanguage.toDF("language","countLetter")
  topRows = dfMR.join(maxCountPerLanguage, ["countLetter","language"])
  return dfTableLang.join(topRows, ["language"])

#Run the join-based lookup on each of the three tagged frequency tables
charDistDf, charDistDf2, charDistDf3 = [
  joinLangAndOutputDF(taggedFrequencyDf, dfLang)
  for taggedFrequencyDf in (dfFreq, dfFreqEisH, dfCiphered3)
]

#########################


#This function can both cipher and decipher: a letter is replaced by the letter `value` positions further in the alphabet (treated as a circular list, so a negative value shifts backwards);
#any other character is returned unchanged. It implements the Caesar cipher
def cypherLetter(x,value):
  """Apply a Caesar shift of `value` positions to the letters of `x`.

  Unaccented letters A-Z and a-z are rotated inside their own alphabet and
  keep their case; every other character is returned unchanged. The shift is
  reduced modulo 26, so any integer works, and a negative `value` shifts
  backwards, which is how a text is deciphered.

  x: a single character or a longer string; None is returned as None so that
     null cells do not crash a Spark UDF.
  value: integer number of positions to shift by.
  Returns the shifted string.
  """
  if x is None:
    return None

  shiftedChars = []
  for char in x:
    if "A" <= char <= "Z":
      alphabetStart = ord("A")
    elif "a" <= char <= "z":
      alphabetStart = ord("a")
    else:  # not an unaccented letter: keep it as it is
      shiftedChars.append(char)
      continue
    #Python's % always returns a value in 0..25 here, even for a negative shift
    position = (ord(char) - alphabetStart + value) % 26
    shiftedChars.append(chr(alphabetStart + position))
  return "".join(shiftedChars)
                   
    
def cypherDataframeColumn(columnName,value,dataframe):
  """Return `dataframe` with the column `columnName` passed through cypherLetter.

  The shift `value` is applied to that column only; every other column is
  selected unchanged and the column order is kept.
  """
  shiftLetter = UserDefinedFunction(lambda letter: cypherLetter(letter,value), StringType())
  selectedColumns = []
  for name in dataframe.columns:
    if name == columnName:
      selectedColumns.append(shiftLetter(name).alias(columnName))
    else:
      selectedColumns.append(name)
  return dataframe.select(*selectedColumns)

#Returns the number of characters separating the reference character from the other character (ex: getCharDiff("a","c") = -2)
def getCharDiff(charMessage,charReference):
  """Return the signed code-point offset of `charMessage` from `charReference`.

  Example: getCharDiff("a","c") is -2.
  """
  messageCode = ord(charMessage)
  referenceCode = ord(charReference)
  return messageCode - referenceCode


#For each joined table, both letters are read from the SAME collected row: one Spark job per table instead of two,
#and the observed letter is guaranteed to be paired with the reference letter of its own row
firstRow=charDistDf.select('letter','mostFrequentLetter').collect()[0]
charFreqText=firstRow.letter #the most frequent letter of our text
charFreqLang=firstRow.mostFrequentLetter #the letter that is normally the most frequent in that language
cypheringKey=getCharDiff(charFreqText,charFreqLang) #"distance" between the two letters = cyphering key

firstRow2=charDistDf2.select('letter','mostFrequentLetter').collect()[0]
charFreqText2=firstRow2.letter #the most frequent letter of our text
charFreqLang2=firstRow2.mostFrequentLetter #the letter that is normally the most frequent in that language
cypheringKey2=getCharDiff(charFreqText2,charFreqLang2) #"distance" between the two letters = cyphering key

firstRow3=charDistDf3.select('letter','mostFrequentLetter').collect()[0]
charFreqText3=firstRow3.letter #the most frequent letter of our text
charFreqLang3=firstRow3.mostFrequentLetter #the letter that is normally the most frequent in that language
cypheringKey3=getCharDiff(charFreqText3,charFreqLang3) #"distance" between the two letters = cyphering key

#Print each label followed by the key computed for the corresponding text
for resultLabel, resultKey in (
  ("LETTER DIFFERENCE FOR ORIGINAL TEXT", cypheringKey),
  ("LETTER DIFFERENCE FOR MODIFIED TEXT", cypheringKey2),
  ("LETTER DIFFERENCE FOR FULLY MODIFIED TEXT", cypheringKey3),
):
  print(resultLabel)
  print(resultKey)

###########################################We test our cyphering/deciphering method on columns of the dataframe
result2= cypherDataframeColumn("letter",4,charDistDf)#shift the "letter" column forward by 4
result2.show()

diffRefChar=getCharDiff("a","c")#ord("a")-ord("c"), i.e. -2
print(diffRefChar)

result3= cypherDataframeColumn("letter",-diffRefChar,result2)#shift result2 by -diffRefChar (i.e. +2). NOTE(review): this does not undo the +4 shift above; a shift of -4 would
result3.show()

print("Original Object")
charDistDf2.show()
result3= cypherDataframeColumn("letter",-cypheringKey2,charDistDf2)#shift the "letter" column back by the key computed for the modified text (result3 is reused)
print("Object decyphered")
result3.show()
###########################################

In [2]:
#Example 2: guessing language and key

#Returns the number of characters separating the reference character from the other character (ex: getCharDiff("a","c") = -2)
def getCharDiff(charMessage,charReference):
  """Return ord(charMessage) minus ord(charReference), as a signed integer.

  Example: getCharDiff("a","c") is -2.
  """
  codeOfMessage, codeOfReference = ord(charMessage), ord(charReference)
  return codeOfMessage - codeOfReference

def readHadoopFileDf(path):
  """Read a Hadoop character-count output file into a DataFrame.

  Each line is split on the tab character: the first field becomes the
  "letter" column and the second one, converted to an integer, becomes the
  "countLetter" column.
  """
  def toLetterCount(line):
    columns = line.split('\t')
    return (columns[0], int(columns[1]))

  hadoopLines = sc.textFile(path)
  return hadoopLines.map(toLetterCount).toDF(["letter","countLetter"])

#Load the three Hadoop outputs into dataframes (labelled at the end of this cell as the original, the modified "E is H" and the fully modified text)
dfFreq, dfFreqEisH, dfFreqCiphered3 = [
  readHadoopFileDf(hadoopOutputPath)
  for hadoopOutputPath in (
    "/FileStore/tables/4gv5en6u1509544751327/part_r_00000-b65ee",
    "/FileStore/tables/part_r_00003-dc956",
    "/FileStore/tables/part_r_00000_CharFreqFullyCiphered3-29f14",
  )
]

#Function which returns the language and the cyphering key of the dataframe (based here on the 4 most frequent letters). If no language is detected precisely, it returns "NoLanguage" as the language and 0 as the cyphering key
def guessLanguageAndCypheringKey(dataframe, lettersCompared=4):
  """Guess the language and the Caesar key of a text from its letter counts.

  The `lettersCompared` most counted letters of `dataframe` are compared, rank
  by rank, with the most used letters of each known language. A language
  matches when every rank is shifted by the same amount modulo 26, so keys
  that wrap around the alphabet (e.g. "t" ciphered into "d") are detected too.

  Args:
    dataframe: DataFrame with a "letter" and a "countLetter" column.
    lettersCompared: number of top-ranked letters that must agree, from 1 to
      10 (the reference table stores 10 letters per language).

  Returns:
    A two-element list [language, cypheringKey]. The key is
    ord(observed) - ord(reference) for the most counted letter, i.e. a value
    between -25 and 25 when both are lowercase letters. If several languages
    match, the one with the largest key is returned. When no language matches,
    or when the table holds fewer than `lettersCompared` letters, the result
    is ["NoLanguage", 0].

  Raises:
    ValueError: if `lettersCompared` is not between 1 and 10.
  """
  #Most used letters, in decreasing order, of each supported language (add a tuple to support another one)
  frequentLettersByLanguage = (
    ('english','e','t','a','o','i','n','s','h','r','d'),
    ('french','e','s','a','i','t','n','r','u','o','l'),
    ('spanish','e','a','o','s','r','n','i','d','l','t'),
    ('dutch','e','n','a','t','i','r','o','d','s','l'),
    ('german','e','n','s','r','i','a','t','d','h','u'),
    ('danish','e','r','n','t','a','i','d','s','l','o'),
    ('portuguese','a','e','o','s','r','i','d','m','n','t'),
    ('italian','e','a','i','o','n','l','r','t','s','c'),
  )
  if not 1 <= lettersCompared <= 10:
    raise ValueError("lettersCompared must be between 1 and 10")

  #A single Spark job: sort by decreasing count and fetch only the rows that are needed
  topRows = dataframe.sort(dataframe.countLetter, ascending=False).select('letter').take(lettersCompared)
  observedLetters = [row.letter for row in topRows]
  if len(observedLetters) < lettersCompared:
    return ["NoLanguage", 0]

  #The reference table is tiny, so the comparison is done locally rather than on the cluster
  matches = []
  for entry in frequentLettersByLanguage:
    language = entry[0]
    referenceLetters = entry[1:]
    #zip stops after the observed letters, i.e. after `lettersCompared` ranks
    offsets = [ord(observed) - ord(reference) for observed, reference in zip(observedLetters, referenceLetters)]
    #Same language <=> same shift on every rank, up to a full turn of the alphabet
    if all((offset - offsets[0]) % 26 == 0 for offset in offsets):
      matches.append((offsets[0], language))

  if not matches:
    return ["NoLanguage", 0]

  cypheringKey, language = max(matches, key=lambda match: match[0])
  return [language, cypheringKey]

#Guess the language AND the cyphering key of each of the three texts; every result is a [language, key] list
langCypherKeyListPlainText, langCypherKeyListEisH, langCypherKeyListCipheredText3 = [
  guessLanguageAndCypheringKey(frequencyDf)
  for frequencyDf in (dfFreq, dfFreqEisH, dfFreqCiphered3)
]

#Print each label followed by the corresponding guessed value (index 1 is the key, index 0 the language)
for outputLabel, outputValue in (
  ("LETTER DIFFERENCE FOR ORIGINAL TEXT", langCypherKeyListPlainText[1]),
  ("LETTER DIFFERENCE FOR MODIFIED TEXT (E is H) => should return 0 since only one letter has been modified", langCypherKeyListEisH[1]),
  ("LETTER DIFFERENCE FOR FULLY MODIFIED TEXT", langCypherKeyListCipheredText3[1]),
  ("LANGUAGE DETECTED", langCypherKeyListCipheredText3[0]),
):
  print(outputLabel)
  print(outputValue)