###IMPORTING NECESSARY MODULES AND PACKAGES

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
import re
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
from sklearn.svm import SVC

###TRAINING AND dev SET

In [None]:
def read_conll(in_file, lowercase=False, max_example=None):
    csv_list = []
    with open(in_file) as f:
        for line in f.readlines():
            csv=[]
            sp = line.strip().split('\t')
            if len(sp)>1:
               csv.append(sp[0])
               csv.append(sp[1])
            csv_list.append(csv)
    return pd.DataFrame(csv_list)

In [None]:
lid_train=read_conll("/content/drive/MyDrive/PRISM ENG-HIN VIT/lid_spaeng/train.conll") # lid only data
temp=lid_train
# combined data of lid+kaggle
lid_dev=read_conll("/content/drive/MyDrive/PRISM ENG-HIN VIT/lid_spaeng/dev.conll") # lid dev data
#lid_dev_bin=pd.read_csv("/content/drive/MyDrive/ML JCOMP SEM5/1A_English_actual_labels.csv") # lid binary dev labels
#lid_dev_mul=pd.read_csv("/content/drive/MyDrive/ML JCOMP SEM5/1B_English_actual_labels.csv") #lid multiclass dev labels

In [None]:
lid_train

Unnamed: 0,0,1
0,,
1,11:11,other
2,.....,other
3,make,lang1
4,a,lang1
...,...,...
295276,borrega,lang2
295277,pelos,lang2
295278,necios,lang2
295279,!!!!!,other


In [None]:
lid_train[1] = np.where(lid_train[1] == "fw", "other",lid_train[1])
lid_train[1] = np.where(lid_train[1] == "mixed", "other",lid_train[1])
lid_train[1] = np.where(lid_train[1] == "ambiguous", "other",lid_train[1])
lid_train[1] = np.where(lid_train[1] == "unk", "other",lid_train[1])

lid_dev[1] = np.where(lid_dev[1] == "fw", "other",lid_dev[1])
lid_dev[1] = np.where(lid_dev[1] == "mixed", "other",lid_dev[1])
lid_dev[1] = np.where(lid_dev[1] == "ambiguous", "other",lid_dev[1])
lid_dev[1] = np.where(lid_dev[1] == "unk", "other",lid_dev[1])



In [None]:
lid_train[1].value_counts()*100/lid_train.shape[0]

lang2    38.264568
lang1    27.240832
other    18.493909
ne        1.756632
Name: 1, dtype: float64

In [None]:
lid_train.columns=["word","label"]
lid_dev.columns=["word","label"]
lid_train=lid_train[(lid_train["word"].isnull()==False)]

In [None]:
lid_dev=lid_dev[(lid_dev["word"].isnull()==False)]

## multiclass dev data

In [None]:
lid_dev.head()

Unnamed: 0,word,label
1,@_easanti,other
2,@mememecaigo,other
3,#todossomoscarlosperez,other
4,#hashtaglargo,lang2
7,I,lang1


##PREPROCESSING AND CLEANING
####1.LOWER CASE CONVERTING
####2.REMOVING BAD SYMBOLS
####3.STOP WORD REMOVER
####4.PUNCTUATION REMOVER
####5.TOKENIZATION
####6.LEMMATIZATION
####7.REMOVE USERNAMES 


In [None]:
lemmatizer = WordNetLemmatizer()
stop_words= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't","not"])


In [None]:
def preprocess(tweets):
  
  tweets.encode('ascii', 'ignore').decode('ascii') #remove emojis
  tweets=tweets.lower()                   #convert to lower case
  tweets = re.sub(r"http\S+", " ", tweets) #remove urls
  l=tweets.split(" ")
  for n,i in enumerate(l):
    if '@' in i:
      l[n]=""
  
  tweets=" ".join(l)
  tweets= re.sub('[^A-Za-z]+',' ',tweets) #remove bad character
  tweets = [word for word in tweets.split(" ") if not word in stop_words] # removing stop words
  tweets= [lemmatizer.lemmatize(token, "v") for token in tweets] #Lemmatization
  tweets=" ".join(tweets) 
  return tweets 
def decontract(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text


## PREPROCESSING THE TRAIN DATA

WHILE USING ONLY lid DATA

In [None]:
lid_train["word"]=lid_train["word"].apply(lambda x:decontract(x))
lid_train["word"]=lid_train["word"].apply(lambda x:preprocess(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## PREPROCESSING THE dev DATA

In [None]:
lid_dev["word"]=lid_dev["word"].apply(lambda x:decontract(x))
lid_dev["word"]=lid_dev["word"].apply(lambda x:preprocess(x))

In [None]:
le=LabelEncoder()

for multiclass data

In [None]:
lid_train["label"] = le.fit_transform(lid_train["label"])
lid_dev["label"] = le.fit_transform(lid_dev["label"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
lid_train["label"].unique()

array([3, 0, 1, 2])

##VECTORIZING THE DATA

USING TFIDF VECTORIZATION for ngrams

In [None]:
vectorizer1 = TfidfVectorizer(ngram_range = (1,1)) #ONLY UNIGRAMS , 
#vectorizer1 = TfidfVectorizer(ngram_range = (2,2)) #ONLY BIGRAM , 
#vectorizer1 = TfidfVectorizer(ngram_range = (1,2)) #UNI + BI , 

#full_train_vectors=vectorizer1.fit_transform(full_train_tweet)
train_vectors=vectorizer1.fit_transform(lid_train["word"])
#val_vectors=vectorizer1.transform(val_tweet)

In [None]:
dev_vectors=vectorizer1.transform(lid_dev["word"])

In [None]:
train_vectors

<253221x20616 sparse matrix of type '<class 'numpy.float64'>'
	with 155873 stored elements in Compressed Sparse Row format>

## for multiclass

In [None]:
t1=lid_train["label"]
t2=lid_dev["label"]


In [None]:
from sklearn.metrics import classification_report

##using OPTIMUM KERNEL AND PARAMETERS FOR SVM

In [None]:
kernels = ['Polynomial', 'RBF', 'Sigmoid','Linear']#A function which returns the corresponding SVC model
def getClassifier(ktype):
    if ktype == 0:
        # Polynomial kernal
        return SVC(kernel='poly', degree=8, gamma="auto")
    elif ktype == 1:
        # Radial Basis Function kernal
        return SVC(kernel='rbf', gamma="auto")
    elif ktype == 2:
        # Sigmoid kernal
        return SVC(kernel='sigmoid', gamma="auto")
    elif ktype == 3:
        # Linear kernal
        return SVC(kernel='linear', gamma="auto")

### PERFORMANCE ON lid train DATA 

In [29]:
model=SVC(kernel='linear',gamma='auto')

In [30]:
model.fit(train_vectors,t1)

SVC(gamma='auto', kernel='linear')

In [None]:
svclassifier_list=[]
for i in range(4):
    # Separate data into dev and training sets
    svclassifier = getClassifier(i) 
    svclassifier.fit(train_vectors,t1)# Make prediction
    svclassifier_list.append(svclassifier)
    y_pred = svclassifier.predict(train_vectors)# Evaluate our model
    print("Evaluation:", kernels[i], "kernel")
    print(classification_report(t1,y_pred))

Evaluation: Polynomial kernel


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00     80437
           1       0.45      1.00      0.62    112988
           2       0.00      0.00      0.00      5187
           3       0.00      0.00      0.00     54609

    accuracy                           0.45    253221
   macro avg       0.11      0.25      0.15    253221
weighted avg       0.20      0.45      0.28    253221



KeyboardInterrupt: ignored

performance on dev data

In [31]:
y_pred=model.predict(dev_vectors)


'              precision    recall  f1-score   support\n\n           0       0.97      0.48      0.64     16712\n           1       0.98      0.83      0.90     14955\n           2       0.88      0.51      0.64       815\n           3       0.41      0.99      0.58      7909\n\n    accuracy                           0.71     40391\n   macro avg       0.81      0.70      0.69     40391\nweighted avg       0.86      0.71      0.72     40391\n'

In [32]:
print(classification_report(t2,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.48      0.64     16712
           1       0.98      0.83      0.90     14955
           2       0.88      0.51      0.64       815
           3       0.41      0.99      0.58      7909

    accuracy                           0.71     40391
   macro avg       0.81      0.70      0.69     40391
weighted avg       0.86      0.71      0.72     40391



In [None]:
for i in range(4):
    # Separate data into dev and training sets
    y_pred = svclassifier_list[i].predict(dev_vectors)# Evaluate our model
    print("Evaluation:", kernels[i], "kernel")
    print(classification_report(t2,y_pred))

NameError: ignored